diff -Nur linux-4.2/Documentation/Makefile linux-4.2-pck/Documentation/Makefile --- linux-4.2/Documentation/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/Documentation/Makefile 2015-09-08 00:48:22.208364829 -0300 @@ -1,4 +1,4 @@ subdir-y := accounting auxdisplay blackfin connector \ - filesystems filesystems ia64 laptops mic misc-devices \ + filesystems filesystems ia64 kdbus laptops mic misc-devices \ networking pcmcia prctl ptp spi timers vDSO video4linux \ watchdog diff -Nur linux-4.2/Documentation/ioctl/ioctl-number.txt linux-4.2-pck/Documentation/ioctl/ioctl-number.txt --- linux-4.2/Documentation/ioctl/ioctl-number.txt 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/Documentation/ioctl/ioctl-number.txt 2015-09-08 00:48:22.208364829 -0300 @@ -292,6 +292,7 @@ 0x92 00-0F drivers/usb/mon/mon_bin.c 0x93 60-7F linux/auto_fs.h 0x94 all fs/btrfs/ioctl.h +0x95 all uapi/linux/kdbus.h kdbus IPC driver 0x97 00-7F fs/ceph/ioctl.h Ceph file system 0x99 00-0F 537-Addinboard driver diff -Nur linux-4.2/Documentation/kdbus/.gitignore linux-4.2-pck/Documentation/kdbus/.gitignore --- linux-4.2/Documentation/kdbus/.gitignore 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/.gitignore 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,2 @@ +*.7 +*.html diff -Nur linux-4.2/Documentation/kdbus/Makefile linux-4.2-pck/Documentation/kdbus/Makefile --- linux-4.2/Documentation/kdbus/Makefile 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/Makefile 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,44 @@ +DOCS := \ + kdbus.xml \ + kdbus.bus.xml \ + kdbus.connection.xml \ + kdbus.endpoint.xml \ + kdbus.fs.xml \ + kdbus.item.xml \ + kdbus.match.xml \ + kdbus.message.xml \ + kdbus.name.xml \ + kdbus.policy.xml \ + kdbus.pool.xml + +XMLFILES := $(addprefix $(obj)/,$(DOCS)) +MANFILES := $(patsubst %.xml, %.7, $(XMLFILES)) +HTMLFILES := $(patsubst %.xml, %.html, $(XMLFILES)) + +XMLTO_ARGS := -m $(srctree)/$(src)/stylesheet.xsl --skip-validation + +quiet_cmd_db2man = MAN $@ + cmd_db2man = xmlto man $(XMLTO_ARGS) -o $(obj) $< +%.7: %.xml + @(which xmlto > /dev/null 2>&1) || \ + (echo "*** You need to install xmlto ***"; \ + exit 1) + $(call cmd,db2man) + +quiet_cmd_db2html = HTML $@ + cmd_db2html = xmlto html-nochunks $(XMLTO_ARGS) -o $(obj) $< +%.html: %.xml + @(which xmlto > /dev/null 2>&1) || \ + (echo "*** You need to install xmlto ***"; \ + exit 1) + $(call cmd,db2html) + +mandocs: $(MANFILES) + +htmldocs: $(HTMLFILES) + +clean-files := $(MANFILES) $(HTMLFILES) + +# we don't support other %docs targets right now +%docs: + @true diff -Nur linux-4.2/Documentation/kdbus/kdbus.bus.xml linux-4.2-pck/Documentation/kdbus/kdbus.bus.xml --- linux-4.2/Documentation/kdbus/kdbus.bus.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.bus.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,344 @@ + + + + + + + kdbus.bus + kdbus.bus + + + + kdbus.bus + 7 + + + + kdbus.bus + kdbus bus + + + + Description + + + A bus is a resource that is shared between connections in order to + transmit messages (see + + kdbus.message + 7 + ). + Each bus is independent, and operations on the bus will not have any + effect on other buses. A bus is a management entity that controls the + addresses of its connections, their policies and message transactions + performed via this bus. + + + Each bus is bound to the mount instance it was created on. It has a + custom name that is unique across all buses of a domain. 
In + + kdbus.fs + 7 + + a bus is presented as a directory. No operations can be performed on + the bus itself; instead you need to perform the operations on an endpoint + associated with the bus. Endpoints are accessible as files underneath the + bus directory. A default endpoint called bus is + provided on each bus. + + + Bus names may be chosen freely except for one restriction: the name must + be prefixed with the numeric effective UID of the creator and a dash. This + is required to avoid namespace clashes between different users. When + creating a bus, the name that is passed in must be properly formatted, or + the kernel will refuse creation of the bus. Example: + 1047-foobar is an acceptable name for a bus + registered by a user with UID 1047. However, + 1024-foobar is not, and neither is + foobar. The UID must be provided in the + user-namespace of the bus owner. + + + To create a new bus, you need to open the control file of a domain and + employ the KDBUS_CMD_BUS_MAKE ioctl. The control + file descriptor that was used to issue + KDBUS_CMD_BUS_MAKE must not previously have been + used for any other control-ioctl and must be kept open for the entire + life-time of the created bus. Closing it will immediately cleanup the + entire bus and all its associated resources and endpoints. Every control + file descriptor can only be used to create a single new bus; from that + point on, it is not used for any further communication until the final + + close + 2 + + . + + + Each bus will generate a random, 128-bit UUID upon creation. This UUID + will be returned to creators of connections through + kdbus_cmd_hello.id128 and can be used to uniquely + identify buses, even across different machines or containers. The UUID + will have its variant bits set to DCE, and denote + version 4 (random). For more details on UUIDs, see + the Wikipedia article on UUIDs. + + + + + + Creating buses + + To create a new bus, the KDBUS_CMD_BUS_MAKE + command is used. It takes a struct kdbus_cmd argument. + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + The flags for creation. + + + KDBUS_MAKE_ACCESS_GROUP + + Make the bus file group-accessible. + + + + + KDBUS_MAKE_ACCESS_WORLD + + Make the bus file world-accessible. + + + + + KDBUS_FLAG_NEGOTIATE + + + Requests a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will return + 0, and the flags + field will have all bits set that are valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + The following items (see + + kdbus.item + 7 + ) + are expected for KDBUS_CMD_BUS_MAKE. + + + + KDBUS_ITEM_MAKE_NAME + + + Contains a null-terminated string that identifies the + bus. The name must be unique across the kdbus domain and + must start with the effective UID of the caller, followed by + a '-' (dash). This item is mandatory. + + + + + + KDBUS_ITEM_BLOOM_PARAMETER + + + Bus-wide bloom parameters passed in a + struct kdbus_bloom_parameter. These settings are + copied back to new connections verbatim. This item is + mandatory. See + + kdbus.item + 7 + + for a more detailed description of this item. 
+ + + + + + KDBUS_ITEM_ATTACH_FLAGS_SEND + + + An optional item that contains a set of attach flags that are + returned to connections when they query the bus creator + metadata. If not set, no metadata is returned. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_BUS_MAKE</constant> may fail with the following + errors + + + + + EBADMSG + + A mandatory item is missing. + + + + + EINVAL + + The flags supplied in the struct kdbus_cmd + are invalid or the supplied name does not start with the current + UID and a '-' (dash). + + + + + EEXIST + + A bus of that name already exists. + + + + + ESHUTDOWN + + The kdbus mount instance for the bus was already shut down. + + + + + EMFILE + + The maximum number of buses for the current user is exhausted. + + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.connection + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.item + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.connection.xml linux-4.2-pck/Documentation/kdbus/kdbus.connection.xml --- linux-4.2/Documentation/kdbus/kdbus.connection.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.connection.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,1244 @@ + + + + + + + kdbus.connection + kdbus.connection + + + + kdbus.connection + 7 + + + + kdbus.connection + kdbus connection + + + + Description + + + Connections are identified by their connection ID, + internally implemented as a uint64_t counter. + The IDs of every newly created bus start at 1, and + every new connection will increment the counter by 1. + The IDs are not reused. + + + In higher level tools, the user visible representation of a connection is + defined by the D-Bus protocol specification as + ":1.<ID>". + + + Messages with a specific uint64_t destination ID are + directly delivered to the connection with the corresponding ID. Signal + messages (see + + kdbus.message + 7 + ) + may be addressed to the special destination ID + KDBUS_DST_ID_BROADCAST (~0ULL) and will then + potentially be delivered to all currently active connections on the bus. + However, in order to receive any signal messages, clients must subscribe + to them by installing a match (see + + kdbus.match + 7 + ). + + + Messages synthesized and sent directly by the kernel will carry the + special source ID KDBUS_SRC_ID_KERNEL (0). + + + In addition to the unique uint64_t connection ID, + established connections can request the ownership of + well-known names, under which they can be found and + addressed by other bus clients. A well-known name is associated with one + and only one connection at a time. See + + kdbus.name + 7 + + on name acquisition, the name registry, and the validity of names. + + + Messages can specify the special destination ID + KDBUS_DST_ID_NAME (0) and carry a well-known name + in the message data. Such a message is delivered to the destination + connection which owns that well-known name. 
(Diagram: connection :1.22 sends a message with src 22 and dst 25, which the bus delivers to :1.25 by its unique ID; :1.25 sends a message with dst 0xffffffffffffffff (KDBUS_DST_ID_BROADCAST), which is delivered to all other connections; :1.55 sends a message with dst 0 and the well-known name org.foo.bar, which is delivered to :1.81, the current owner of that name.)

Privileged connections

A connection is considered privileged if the user it was created by is the same one that created the bus, or if the creating task had CAP_IPC_OWNER set when it called KDBUS_CMD_HELLO (see below).

Privileged connections have permission to employ certain restricted functions and commands, which are explained below and in other kdbus man-pages.

Activator and policy holder connections

An activator connection is a placeholder for a well-known name. Messages sent to such a connection can be used to start an implementer connection, which will then get all the messages from the activator copied over. An activator connection cannot be used to send any message.

A policy holder connection only installs a policy for one or more names. These policy entries are kept active as long as the connection is alive, and are removed once it terminates. Such a policy connection type can be used to deploy restrictions for names that are not yet active on the bus. A policy holder connection cannot be used to send any message.

The creation of activator or policy holder connections is restricted to privileged users on the bus (see above).

Monitor connections

Monitors are eavesdropping connections that receive all the traffic on the bus but are invisible to other connections. Such connections have all the properties of any other regular connection, except for the following details:

- They will get every message sent over the bus, both unicasts and broadcasts.

- Installing matches for signal messages is neither necessary nor allowed.

- They cannot send messages or be directly addressed as receivers.

- They cannot own well-known names. Therefore, they also cannot operate as activators.

- Their creation and destruction will not cause KDBUS_ITEM_ID_{ADD,REMOVE} notifications (see kdbus.item(7)).

- They are not listed with their unique name in name registry dumps (see KDBUS_CMD_NAME_LIST in kdbus.name(7)), so other connections cannot detect the presence of a monitor.
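A connection becomes a monitor by passing KDBUS_HELLO_MONITOR to the KDBUS_CMD_HELLO ioctl described in the next section. The following is a minimal sketch, not an authoritative implementation: the endpoint path and pool size are illustrative, and error handling is reduced to the bare minimum.

    /* Sketch: open an endpoint and become a monitor connection.
     * Requires privileges on the bus; fails with EPERM otherwise. */
    #include <fcntl.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kdbus.h>

    /* ep_path: e.g. "/sys/fs/kdbus/0-system/bus" (illustrative) */
    static int connect_monitor(const char *ep_path)
    {
            struct kdbus_cmd_hello hello;
            int fd = open(ep_path, O_RDWR | O_CLOEXEC);

            if (fd < 0)
                    return -1;

            memset(&hello, 0, sizeof(hello));
            hello.size = sizeof(hello);
            hello.flags = KDBUS_HELLO_MONITOR; /* eavesdrop; no matches needed */
            hello.pool_size = 1024 * 1024;     /* must be a PAGE_SIZE multiple */

            if (ioctl(fd, KDBUS_CMD_HELLO, &hello) < 0)
                    return -1;

            return fd; /* hello.id now holds the connection's unique ID */
    }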
The creation of monitor connections is restricted to privileged users on the bus (see above).

Creating connections

A connection to a bus is created by opening an endpoint file (see kdbus.endpoint(7)) of a bus and becoming an active client with the KDBUS_CMD_HELLO ioctl. Every connection has a unique identifier on the bus and can address messages to every other connection on the same bus by using the peer's connection ID as the destination.

The KDBUS_CMD_HELLO ioctl takes a struct kdbus_cmd_hello as argument.

struct kdbus_cmd_hello {
    __u64 size;
    __u64 flags;
    __u64 return_flags;
    __u64 attach_flags_send;
    __u64 attach_flags_recv;
    __u64 bus_flags;
    __u64 id;
    __u64 pool_size;
    __u64 offset;
    __u8 id128[16];
    struct kdbus_item items[0];
};

The fields in this struct are described below.

size
    The overall size of the struct, including its items.

flags
    Flags to apply to this connection.

    KDBUS_HELLO_ACCEPT_FD
        When this flag is set, the connection can be sent file descriptors as message payload of unicast messages. If it is not set, an attempt to send file descriptors will result in -ECOMM on the sender's side.

    KDBUS_HELLO_ACTIVATOR
        Make this connection an activator (see above). With this bit set, an item of type KDBUS_ITEM_NAME has to be attached. This item describes the well-known name this connection should be an activator for. A connection cannot be an activator and a policy holder at the same time, so this bit is not allowed together with KDBUS_HELLO_POLICY_HOLDER.

    KDBUS_HELLO_POLICY_HOLDER
        Make this connection a policy holder (see above). With this bit set, an item of type KDBUS_ITEM_NAME has to be attached. This item describes the well-known name this connection should hold a policy for. A connection cannot be an activator and a policy holder at the same time, so this bit is not allowed together with KDBUS_HELLO_ACTIVATOR.

    KDBUS_HELLO_MONITOR
        Make this connection a monitor connection (see above). This flag can only be set by privileged bus connections. See below for more information. A connection cannot be a monitor and an activator or a policy holder at the same time, so this bit is not allowed together with KDBUS_HELLO_ACTIVATOR or KDBUS_HELLO_POLICY_HOLDER.

    KDBUS_FLAG_NEGOTIATE
        Requests a set of valid flags for this ioctl. When this bit is set, no action is taken; the ioctl will return 0, and the flags field will have all bits set that are valid for this command. The KDBUS_FLAG_NEGOTIATE bit will be cleared by the operation.

return_flags
    Flags returned by the kernel. Currently unused and always set to 0 by the kernel.

attach_flags_send
    Set the bits for metadata this connection permits to be sent to the receiving peer. Only metadata items that are both allowed to be sent by the sender and requested by the receiver will be attached to the message.

attach_flags_recv
    Request the attachment of metadata for each message received by this connection. See kdbus(7) for information about metadata, and kdbus.item(7) regarding items in general.

bus_flags
    Upon successful completion of the ioctl, this member will contain the flags of the bus it connected to.
+ + + + + id + + Upon successful completion of the command, this member will contain + the numerical ID of the new connection. + + + + + pool_size + + The size of the communication pool, in bytes. The pool can be + accessed by calling + + mmap + 2 + + on the file descriptor that was used to issue the + KDBUS_CMD_HELLO ioctl. + The pool size of a connection must be greater than + 0 and a multiple of + PAGE_SIZE. See + + kdbus.pool + 7 + + for more information. + + + + + offset + + The kernel will return the offset in the pool where returned details + will be stored. See below. + + + + + id128 + + Upon successful completion of the ioctl, this member will contain the + 128-bit UUID of the connected bus. + + + + + items + + + Variable list of items containing optional additional information. + The following items are currently expected/valid: + + + + KDBUS_ITEM_CONN_DESCRIPTION + + + Contains a string that describes this connection, so it can + be identified later. + + + + + + KDBUS_ITEM_NAME + KDBUS_ITEM_POLICY_ACCESS + + + For activators and policy holders only, combinations of + these two items describe policy access entries. See + + kdbus.policy + 7 + + for further details. + + + + + + KDBUS_ITEM_CREDS + KDBUS_ITEM_PIDS + KDBUS_ITEM_SECLABEL + + + Privileged bus users may submit these types in order to + create connections with faked credentials. This information + will be returned when peer information is queried by + KDBUS_CMD_CONN_INFO. See below for more + information on retrieving information on connections. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + At the offset returned in the offset field of + struct kdbus_cmd_hello, the kernel will store items + of the following types: + + + + + KDBUS_ITEM_BLOOM_PARAMETER + + + Bloom filter parameter as defined by the bus creator. + + + + + + + The offset in the pool has to be freed with the + KDBUS_CMD_FREE ioctl. See + + kdbus.pool + 7 + + for further information. + + + + + Retrieving information on a connection + + The KDBUS_CMD_CONN_INFO ioctl can be used to + retrieve credentials and properties of the initial creator of a + connection. This ioctl uses the following struct. + + + +struct kdbus_cmd_info { + __u64 size; + __u64 flags; + __u64 return_flags; + __u64 id; + __u64 attach_flags; + __u64 offset; + __u64 info_size; + struct kdbus_item items[0]; +}; + + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Currently, no flags are supported. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. If set, the ioctl will return 0, + and the flags field is set to + 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + id + + The numerical ID of the connection for which information is to be + retrieved. If set to a non-zero value, the + KDBUS_ITEM_OWNED_NAME item is ignored. + + + + + attach_flags + + Specifies which metadata items should be attached to the answer. See + + kdbus.message + 7 + . + + + + + offset + + When the ioctl returns, this field will contain the offset of the + connection information inside the caller's pool. See + + kdbus.pool + 7 + + for further information. 
+ + + + + info_size + + The kernel will return the size of the returned information, so + applications can optionally + + mmap + 2 + + specific parts of the pool. See + + kdbus.pool + 7 + + for further information. + + + + + items + + + The following items are expected for + KDBUS_CMD_CONN_INFO. + + + + KDBUS_ITEM_OWNED_NAME + + + Contains the well-known name of the connection to look up as. + This item is mandatory if the id field is + set to 0. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + When the ioctl returns, the following struct will be stored in the + caller's pool at offset. The fields in this struct + are described below. + + + +struct kdbus_info { + __u64 size; + __u64 id; + __u64 flags; + struct kdbus_item items[0]; +}; + + + + + size + + The overall size of the struct, including its items. + + + + + id + + The connection's unique ID. + + + + + flags + + The connection's flags as specified when it was created. + + + + + items + + + Depending on the flags field in + struct kdbus_cmd_info, items of types + KDBUS_ITEM_OWNED_NAME and + KDBUS_ITEM_CONN_DESCRIPTION may follow here. + KDBUS_ITEM_NEGOTIATE is also allowed. + + + + + + + Once the caller is finished with parsing the return buffer, it needs to + employ the KDBUS_CMD_FREE command for the offset, in + order to free the buffer part. See + + kdbus.pool + 7 + + for further information. + + + + + Getting information about a connection's bus creator + + The KDBUS_CMD_BUS_CREATOR_INFO ioctl takes the same + struct as KDBUS_CMD_CONN_INFO, but is used to + retrieve information about the creator of the bus the connection is + attached to. The metadata returned by this call is collected during the + creation of the bus and is never altered afterwards, so it provides + pristine information on the task that created the bus, at the moment when + it did so. + + + In response to this call, a slice in the connection's pool is allocated + and filled with an object of type struct kdbus_info, + pointed to by the ioctl's offset field. + + + +struct kdbus_info { + __u64 size; + __u64 id; + __u64 flags; + struct kdbus_item items[0]; +}; + + + + + size + + The overall size of the struct, including its items. + + + + + id + + The bus ID. + + + + + flags + + The bus flags as specified when it was created. + + + + + items + + + Metadata information is stored in items here. The item list + contains a KDBUS_ITEM_MAKE_NAME item that + indicates the bus name of the calling connection. + KDBUS_ITEM_NEGOTIATE is allowed to probe + for known item types. + + + + + + + Once the caller is finished with parsing the return buffer, it needs to + employ the KDBUS_CMD_FREE command for the offset, in + order to free the buffer part. See + + kdbus.pool + 7 + + for further information. + + + + + Updating connection details + + Some of a connection's details can be updated with the + KDBUS_CMD_CONN_UPDATE ioctl, using the file + descriptor that was used to create the connection. The update command + uses the following struct. + + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Currently, no flags are supported. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. 
If set, the ioctl will return 0, + and the flags field is set to + 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + Items to describe the connection details to be updated. The + following item types are supported. + + + + KDBUS_ITEM_ATTACH_FLAGS_SEND + + + Supply a new set of metadata items that this connection + permits to be sent along with messages. + + + + + + KDBUS_ITEM_ATTACH_FLAGS_RECV + + + Supply a new set of metadata items that this connection + requests to be attached to each message. + + + + + + KDBUS_ITEM_NAME + KDBUS_ITEM_POLICY_ACCESS + + + Policy holder connections may supply a new set of policy + information with these items. For other connection types, + EOPNOTSUPP is returned in + errno. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + + Termination of connections + + A connection can be terminated by simply calling + + close + 2 + + on its file descriptor. All pending incoming messages will be discarded, + and the memory allocated by the pool will be freed. + + + + An alternative way of closing down a connection is via the + KDBUS_CMD_BYEBYE ioctl. This ioctl will succeed only + if the message queue of the connection is empty at the time of closing; + otherwise, the ioctl will fail with errno set to + EBUSY. When this ioctl returns + successfully, the connection has been terminated and won't accept any new + messages from remote peers. This way, a connection can be terminated + race-free, without losing any messages. The ioctl takes an argument of + type struct kdbus_cmd. + + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Currently, no flags are supported. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. If set, the ioctl will fail with + errno set to EPROTO, and + the flags field is set to 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + The following item types are supported. + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_HELLO</constant> may fail with the following + errors + + + + + EFAULT + + The supplied pool size was 0 or not a multiple of the page size. + + + + + EINVAL + + The flags supplied in struct kdbus_cmd_hello + are invalid. + + + + + EINVAL + + An illegal combination of + KDBUS_HELLO_MONITOR, + KDBUS_HELLO_ACTIVATOR and + KDBUS_HELLO_POLICY_HOLDER was passed in + flags. + + + + + EINVAL + + An invalid set of items was supplied. + + + + + ECONNREFUSED + + The attach_flags_send field did not satisfy the requirements of + the bus. + + + + + EPERM + + A KDBUS_ITEM_CREDS items was supplied, but the + current user is not privileged. 
+ + + + + ESHUTDOWN + + The bus you were trying to connect to has already been shut down. + + + + + EMFILE + + The maximum number of connections on the bus has been reached. + + + + + EOPNOTSUPP + + The endpoint does not support the connection flags supplied in + struct kdbus_cmd_hello. + + + + + + + + <constant>KDBUS_CMD_BYEBYE</constant> may fail with the following + errors + + + + + EALREADY + + The connection has already been shut down. + + + + + EBUSY + + There are still messages queued up in the connection's pool. + + + + + + + + <constant>KDBUS_CMD_CONN_INFO</constant> may fail with the following + errors + + + + + EINVAL + + Invalid flags, or neither an ID nor a name was provided, or the + name is invalid. + + + + + ESRCH + + Connection lookup by name failed. + + + + + ENXIO + + No connection with the provided connection ID found. + + + + + + + + <constant>KDBUS_CMD_CONN_UPDATE</constant> may fail with the following + errors + + + + + EINVAL + + Illegal flags or items. + + + + + EINVAL + + Wildcards submitted in policy entries, or illegal sequence + of policy items. + + + + + EOPNOTSUPP + + Operation not supported by connection. + + + + + E2BIG + + Too many policy items attached. + + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.policy + 7 + + + + + kdbus.pool + 7 + + + + + kdbus.item + 7 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.endpoint.xml linux-4.2-pck/Documentation/kdbus/kdbus.endpoint.xml --- linux-4.2/Documentation/kdbus/kdbus.endpoint.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.endpoint.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,429 @@ + + + + + + + kdbus.endpoint + kdbus.endpoint + + + + kdbus.endpoint + 7 + + + + kdbus.endpoint + kdbus endpoint + + + + Description + + + Endpoints are entry points to a bus (see + + kdbus.bus + 7 + ). + By default, each bus has a default + endpoint called 'bus'. The bus owner has the ability to create custom + endpoints with specific names, permissions, and policy databases + (see below). An endpoint is presented as file underneath the directory + of the parent bus. + + + To create a custom endpoint, open the default endpoint + (bus) and use the + KDBUS_CMD_ENDPOINT_MAKE ioctl with + struct kdbus_cmd. Custom endpoints always have a policy + database that, by default, forbids any operation. You have to explicitly + install policy entries to allow any operation on this endpoint. + + + Once KDBUS_CMD_ENDPOINT_MAKE succeeded, the new + endpoint will appear in the filesystem + ( + kdbus.bus + 7 + ), and the used file descriptor will manage the + newly created endpoint resource. It cannot be used to manage further + resources and must be kept open as long as the endpoint is needed. The + endpoint will be terminated as soon as the file descriptor is closed. + + + Endpoint names may be chosen freely except for one restriction: the name + must be prefixed with the numeric effective UID of the creator and a dash. + This is required to avoid namespace clashes between different users. When + creating an endpoint, the name that is passed in must be properly + formatted or the kernel will refuse creation of the endpoint. Example: + 1047-my-endpoint is an acceptable name for an + endpoint registered by a user with UID 1047. However, + 1024-my-endpoint is not, and neither is + my-endpoint. The UID must be provided in the + user-namespace of the bus. 
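Since the kernel enforces the UID-dash prefix, the name is usually generated at runtime rather than hard-coded. A small illustrative helper follows; the my-endpoint suffix is an arbitrary example.

    /* Sketch: build an endpoint name of the form "<euid>-my-endpoint",
     * which satisfies the mandatory UID prefix rule described above. */
    #include <stdio.h>
    #include <unistd.h>

    static int format_endpoint_name(char *buf, size_t len)
    {
            int n = snprintf(buf, len, "%u-my-endpoint",
                             (unsigned int)geteuid());

            return (n < 0 || (size_t)n >= len) ? -1 : 0; /* -1 on truncation */
    }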
+ + + To create connections to a bus, use KDBUS_CMD_HELLO + on a file descriptor returned by open() on an + endpoint node. See + + kdbus.connection + 7 + + for further details. + + + + + Creating custom endpoints + + To create a new endpoint, the + KDBUS_CMD_ENDPOINT_MAKE command is used. Along with + the endpoint's name, which will be used to expose the endpoint in the + + kdbus.fs + 7 + , + the command also optionally takes items to set up the endpoint's + + kdbus.policy + 7 + . + KDBUS_CMD_ENDPOINT_MAKE takes a + struct kdbus_cmd argument. + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + The flags for creation. + + + KDBUS_MAKE_ACCESS_GROUP + + Make the endpoint file group-accessible. + + + + + KDBUS_MAKE_ACCESS_WORLD + + Make the endpoint file world-accessible. + + + + + KDBUS_FLAG_NEGOTIATE + + + Requests a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will return + 0, and the flags + field will have all bits set that are valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + The following items are expected for + KDBUS_CMD_ENDPOINT_MAKE. + + + + KDBUS_ITEM_MAKE_NAME + + Contains a string to identify the endpoint name. + + + + + KDBUS_ITEM_NAME + KDBUS_ITEM_POLICY_ACCESS + + + These items are used to set the policy attached to the + endpoint. For more details on bus and endpoint policies, see + + kdbus.policy + 7 + . + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + + Updating endpoints + + To update an existing endpoint, the + KDBUS_CMD_ENDPOINT_UPDATE command is used on the file + descriptor that was used to create the endpoint, using + KDBUS_CMD_ENDPOINT_MAKE. The only relevant detail of + the endpoint that can be updated is the policy. When the command is + employed, the policy of the endpoint is replaced + atomically with the new set of rules. + The command takes a struct kdbus_cmd argument. + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Unused for this command. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. If set, the ioctl will return 0, + and the flags field is set to + 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + The following items are expected for + KDBUS_CMD_ENDPOINT_UPDATE. + + + + KDBUS_ITEM_NAME + KDBUS_ITEM_POLICY_ACCESS + + + These items are used to set the policy attached to the + endpoint. For more details on bus and endpoint policies, see + + kdbus.policy + 7 + . + Existing policy is atomically replaced with the new rules + provided. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. 
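To illustrate how the command struct and its items are assembled in practice, here is a hedged sketch of replacing an endpoint's policy with a single entry via KDBUS_CMD_ENDPOINT_UPDATE; the service name and the access level are invented examples, and the fixed-size buffer assumes the items fit.

    /* Sketch: atomically replace an endpoint's policy with one rule
     * that lets everyone talk to the named service. */
    #include <stddef.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kdbus.h>

    #define ALIGN8(l) (((l) + 7) & ~7ULL)

    static int update_endpoint_policy(int ep_fd)
    {
            char buf[256];
            struct kdbus_cmd *cmd = (struct kdbus_cmd *)buf;
            struct kdbus_item *item = cmd->items;
            const char *name = "org.example.Service"; /* hypothetical name */

            memset(buf, 0, sizeof(buf));

            /* The name the policy entry applies to ... */
            item->type = KDBUS_ITEM_NAME;
            item->size = offsetof(struct kdbus_item, str) + strlen(name) + 1;
            memcpy(item->str, name, strlen(name) + 1);

            /* ... followed by the access rule (8-byte aligned). */
            item = (struct kdbus_item *)((char *)item + ALIGN8(item->size));
            item->type = KDBUS_ITEM_POLICY_ACCESS;
            item->size = offsetof(struct kdbus_item, policy_access) +
                         sizeof(struct kdbus_policy_access);
            item->policy_access.type = KDBUS_POLICY_ACCESS_WORLD;
            item->policy_access.access = KDBUS_POLICY_TALK;

            cmd->size = ((char *)item - buf) + item->size;

            return ioctl(ep_fd, KDBUS_CMD_ENDPOINT_UPDATE, cmd);
    }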
+ + + + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_ENDPOINT_MAKE</constant> may fail with the + following errors + + + + + EINVAL + + The flags supplied in the struct kdbus_cmd + are invalid. + + + + + EINVAL + + Illegal combination of KDBUS_ITEM_NAME and + KDBUS_ITEM_POLICY_ACCESS was provided. + + + + + EEXIST + + An endpoint of that name already exists. + + + + + EPERM + + The calling user is not privileged. See + + kdbus + 7 + + for information about privileged users. + + + + + + + + <constant>KDBUS_CMD_ENDPOINT_UPDATE</constant> may fail with the + following errors + + + + + EINVAL + + The flags supplied in struct kdbus_cmd + are invalid. + + + + + EINVAL + + Illegal combination of KDBUS_ITEM_NAME and + KDBUS_ITEM_POLICY_ACCESS was provided. + + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.item + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.fs.xml linux-4.2-pck/Documentation/kdbus/kdbus.fs.xml --- linux-4.2/Documentation/kdbus/kdbus.fs.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.fs.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,124 @@ + + + + + + + kdbus.fs + kdbus.fs + + + + kdbus.fs + 7 + + + + kdbus.fs + kdbus file system + + + + File-system Layout + + + The kdbusfs pseudo filesystem provides access to + kdbus entities, such as buses and + endpoints. Each time the filesystem is mounted, + a new, isolated kdbus instance is created, which is independent from the + other instances. + + + The system-wide standard mount point for kdbusfs is + /sys/fs/kdbus. + + + + Buses are represented as directories in the file system layout, whereas + endpoints are exposed as files inside these directories. At the top-level, + a control node is present, which can be opened to + create new buses via the KDBUS_CMD_BUS_MAKE ioctl. + Each bus shows a default endpoint called + bus, which can be opened to either create a connection + with the KDBUS_CMD_HELLO ioctl, or to create new + custom endpoints for the bus with + KDBUS_CMD_ENDPOINT_MAKE. See + + kdbus.bus + 7 + , + + kdbus.connection + 7 + and + + kdbus.endpoint + 7 + + for more details. 
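As a rough sketch of the control-node flow described above (the bus name suffix, bloom parameter values, and buffer size are illustrative assumptions, not mandated values):

    /* Sketch: create a bus via the control node. KDBUS_CMD_BUS_MAKE
     * requires a KDBUS_ITEM_MAKE_NAME and a KDBUS_ITEM_BLOOM_PARAMETER. */
    #include <fcntl.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/kdbus.h>

    #define ALIGN8(l) (((l) + 7) & ~7ULL)

    static int make_bus(void)
    {
            char buf[256], name[64];
            struct kdbus_cmd *cmd = (struct kdbus_cmd *)buf;
            struct kdbus_item *item;
            int fd = open("/sys/fs/kdbus/control", O_RDWR | O_CLOEXEC);

            if (fd < 0)
                    return -1;

            /* "<euid>-example" satisfies the mandatory UID-dash prefix. */
            snprintf(name, sizeof(name), "%u-example", (unsigned int)geteuid());

            memset(buf, 0, sizeof(buf));
            item = cmd->items;
            item->type = KDBUS_ITEM_MAKE_NAME;
            item->size = offsetof(struct kdbus_item, str) + strlen(name) + 1;
            memcpy(item->str, name, strlen(name) + 1);

            item = (struct kdbus_item *)((char *)item + ALIGN8(item->size));
            item->type = KDBUS_ITEM_BLOOM_PARAMETER;
            item->size = offsetof(struct kdbus_item, bloom_parameter) +
                         sizeof(struct kdbus_bloom_parameter);
            item->bloom_parameter.size = 64; /* illustrative values only */
            item->bloom_parameter.n_hash = 1;

            cmd->size = ((char *)item - buf) + item->size;

            if (ioctl(fd, KDBUS_CMD_BUS_MAKE, cmd) < 0) {
                    close(fd);
                    return -1;
            }

            return fd; /* must stay open for the lifetime of the bus */
    }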
Following, you can see an example layout of the kdbusfs filesystem:

    /sys/fs/kdbus/                ; mount-point
    |-- 0-system                  ; bus directory
    |   |-- bus                   ; default endpoint
    |   `-- 1017-custom           ; custom endpoint
    |-- 1000-user                 ; bus directory
    |   |-- bus                   ; default endpoint
    |   |-- 1000-service-A        ; custom endpoint
    |   `-- 1000-service-B        ; custom endpoint
    `-- control                   ; control file

Mounting instances

In order to get a new and separate kdbus environment, a new instance of kdbusfs can be mounted like this:

    # mount -t kdbusfs kdbusfs /tmp/new_kdbus/

See Also

kdbus(7), kdbus.bus(7), kdbus.connection(7), kdbus.endpoint(7), mount(8)

diff -Nur linux-4.2/Documentation/kdbus/kdbus.item.xml linux-4.2-pck/Documentation/kdbus/kdbus.item.xml
--- linux-4.2/Documentation/kdbus/kdbus.item.xml 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/Documentation/kdbus/kdbus.item.xml 2015-09-08 00:48:22.208364829 -0300

kdbus.item
7

kdbus.item
kdbus item structure, layout and usage

Description

To flexibly augment transport structures, data blobs of type struct kdbus_item can be attached to the structs passed into the ioctls. Some ioctls make items of certain types mandatory, others are optional. Items that are unsupported by the ioctls they are attached to will cause the ioctl to fail with errno set to EINVAL. Items are also used for information stored in a connection's pool, such as received messages, name lists, or requested connection or bus owner information. Depending on the type of an item, its total size is either fixed or variable.

Chaining items

Whenever items are used as part of the kdbus kernel API, they are embedded in structs that themselves include a size field containing the overall size of the structure. This allows multiple items to be chained up, and an item iterator (see below) is capable of detecting the end of an item chain.

Alignment

The kernel expects all items to be aligned to 8-byte boundaries. Unaligned items will cause the ioctl they are used with to fail with errno set to EINVAL. An item that has an unaligned size itself hence needs to be padded if it is followed by another item.

Iterating items

A simple iterator would iterate over the items until they have reached the embedding structure's overall size. An example implementation is shown below.

#define KDBUS_ALIGN8(val) (((val) + 7) & ~7)

#define KDBUS_ITEM_NEXT(item) \
    (typeof(item))((uint8_t *)(item) + KDBUS_ALIGN8((item)->size))

#define KDBUS_ITEM_FOREACH(item, head, first) \
    for ((item) = (head)->first; \
         ((uint8_t *)(item) < (uint8_t *)(head) + (head)->size) && \
         ((uint8_t *)(item) >= (uint8_t *)(head)); \
         (item) = KDBUS_ITEM_NEXT(item))

Item layout

A struct kdbus_item consists of a size field, describing its overall size, and a type field, both 64 bit wide. They are followed by a union to store information that is specific to the item's type. The struct layout is shown below.
struct kdbus_item {
    __u64 size;
    __u64 type;
    /* item payload - see below */
    union {
        __u8 data[0];
        __u32 data32[0];
        __u64 data64[0];
        char str[0];

        __u64 id;
        struct kdbus_vec vec;
        struct kdbus_creds creds;
        struct kdbus_pids pids;
        struct kdbus_audit audit;
        struct kdbus_caps caps;
        struct kdbus_timestamp timestamp;
        struct kdbus_name name;
        struct kdbus_bloom_parameter bloom_parameter;
        struct kdbus_bloom_filter bloom_filter;
        struct kdbus_memfd memfd;
        int fds[0];
        struct kdbus_notify_name_change name_change;
        struct kdbus_notify_id_change id_change;
        struct kdbus_policy_access policy_access;
    };
};

struct kdbus_item should never be used to allocate an item instance, as its size may grow in future releases of the API. Instead, it should be manually assembled by storing the size, type and payload to a struct of its own.

Item types

Negotiation item

KDBUS_ITEM_NEGOTIATE
    When this item is attached to any ioctl, programs can probe the kernel for known item types. The item carries an array of uint64_t values in item.data64, each set to an item type to probe. The kernel will reset each member of this array that is not recognized as a valid item type to 0. This way, users can negotiate kernel features at start-up to keep newer userspace compatible with older kernels. This item is never attached by the kernel in response to any command.

Command specific items

KDBUS_ITEM_PAYLOAD_VEC
KDBUS_ITEM_PAYLOAD_OFF
    Messages are directly copied by the sending process into the receiver's pool (see kdbus.pool(7)). This way, two peers can exchange data by effectively doing a single copy from one process to another; the kernel will not buffer the data anywhere else. KDBUS_ITEM_PAYLOAD_VEC is used when sending messages. The item references a memory address where the payload data can be found. KDBUS_ITEM_PAYLOAD_OFF is used when messages are received, and the offset value describes the offset inside the receiving connection's pool (see kdbus.pool(7)) where the message payload can be found. See kdbus.message(7) for more information on passing payload data along with a message.

    struct kdbus_vec {
        __u64 size;
        union {
            __u64 address;
            __u64 offset;
        };
    };

KDBUS_ITEM_PAYLOAD_MEMFD
    Transports a file descriptor of a memfd in struct kdbus_memfd in item.memfd. The size field has to match the actual size of the memfd that was specified when it was created. The start parameter denotes the offset inside the memfd at which the referenced payload starts. See kdbus.message(7) for more information on passing payload data along with a message.

    struct kdbus_memfd {
        __u64 start;
        __u64 size;
        int fd;
        __u32 __pad;
    };

KDBUS_ITEM_FDS
    Contains an array of file descriptors. When used with KDBUS_CMD_SEND, the values of this array must be filled with valid file descriptor numbers. When received as an item attached to a message, the array will contain the numbers of the installed file descriptors, or -1 in case an error occurred. In either case, the number of entries in the array is derived from the item's total size. See kdbus.message(7) for more information.

Items specific to some commands

KDBUS_ITEM_CANCEL_FD
    Transports a file descriptor that can be used to cancel a synchronous KDBUS_CMD_SEND operation by writing to it. The file descriptor is stored in item.fds[0].
The item may only contain one + file descriptor. See + + kdbus.message + 7 + + for more information on this item and how to use it. + + + + + KDBUS_ITEM_BLOOM_PARAMETER + + Contains a set of bloom parameters as + struct kdbus_bloom_parameter in + item.bloom_parameter. + The item is passed from userspace to kernel during the + KDBUS_CMD_BUS_MAKE ioctl, and returned + verbatim when KDBUS_CMD_HELLO is called. + The kernel does not use the bloom parameters, but they need to + be known by each connection on the bus in order to define the + bloom filter hash details. See + + kdbus.match + 7 + + for more information on matching and bloom filters. + +struct kdbus_bloom_parameter { + __u64 size; + __u64 n_hash; +}; + + + + + + KDBUS_ITEM_BLOOM_FILTER + + Carries a bloom filter as + struct kdbus_bloom_filter in + item.bloom_filter. It is mandatory to send this + item attached to a struct kdbus_msg, in case the + message is a signal. This item is never transported from kernel to + userspace. See + + kdbus.match + 7 + + for more information on matching and bloom filters. + +struct kdbus_bloom_filter { + __u64 generation; + __u64 data[0]; +}; + + + + + + KDBUS_ITEM_BLOOM_MASK + + Transports a bloom mask as binary data blob + stored in item.data. This item is used to + describe a match into a connection's match database. See + + kdbus.match + 7 + + for more information on matching and bloom filters. + + + + + KDBUS_ITEM_DST_NAME + + Contains a well-known name to send a + message to, as null-terminated string in + item.str. This item is used with + KDBUS_CMD_SEND. See + + kdbus.message + 7 + + for more information on how to send a message. + + + + + KDBUS_ITEM_MAKE_NAME + + Contains a bus name or + endpoint name, stored as null-terminated + string in item.str. This item is sent from + userspace to kernel when buses or endpoints are created, and + returned back to userspace when the bus creator information is + queried. See + + kdbus.bus + 7 + + and + + kdbus.endpoint + 7 + . + + + + + KDBUS_ITEM_ATTACH_FLAGS_SEND + KDBUS_ITEM_ATTACH_FLAGS_RECV + + Contains a set of attach flags at + send or receive time. See + + kdbus + 7 + , + + kdbus.bus + 7 + and + + kdbus.connection + 7 + + for more information on attach flags. + + + + + KDBUS_ITEM_ID + + Transports a connection's numerical ID of + a connection as uint64_t value in + item.id. + + + + + KDBUS_ITEM_NAME + + Transports a name associated with the + name registry as null-terminated string as + struct kdbus_name in + item.name. The flags + contains the flags of the name. See + + kdbus.name + 7 + + for more information on how to access the name registry of a bus. + +struct kdbus_name { + __u64 flags; + char name[0]; +}; + + + + + + + + Items attached by the kernel as metadata + + + + KDBUS_ITEM_TIMESTAMP + + Contains both the monotonic and the + realtime timestamp, taken when the message + was processed on the kernel side. + Stored as struct kdbus_timestamp in + item.timestamp. + +struct kdbus_timestamp { + __u64 seqnum; + __u64 monotonic_ns; + __u64 realtime_ns; +}; + + + + + + KDBUS_ITEM_CREDS + + Contains a set of user and + group information as 32-bit values, in the + usual four flavors: real, effective, saved and filesystem related. + Stored as struct kdbus_creds in + item.creds. + +struct kdbus_creds { + __u32 uid; + __u32 euid; + __u32 suid; + __u32 fsuid; + __u32 gid; + __u32 egid; + __u32 sgid; + __u32 fsgid; +}; + + + + + + KDBUS_ITEM_PIDS + + Contains the PID, TID + and parent PID (PPID) of a remote peer. 
+ Stored as struct kdbus_pids in + item.pids. + +struct kdbus_pids { + __u64 pid; + __u64 tid; + __u64 ppid; +}; + + + + + + KDBUS_ITEM_AUXGROUPS + + Contains the auxiliary (supplementary) groups + a remote peer is a member of, stored as array of + uint32_t values in item.data32. + The array length can be determined by looking at the item's total + size, subtracting the size of the header and dividing the + remainder by sizeof(uint32_t). + + + + + KDBUS_ITEM_OWNED_NAME + + Contains a well-known name currently owned + by a connection. The name is stored as null-terminated string in + item.str. Its length can also be derived from + the item's total size. + + + + + KDBUS_ITEM_TID_COMM [*] + + Contains the comm string of a task's + TID (thread ID), stored as null-terminated + string in item.str. Its length can also be + derived from the item's total size. Receivers of this item should + not use its contents for any kind of security measures. See below. + + + + + KDBUS_ITEM_PID_COMM [*] + + Contains the comm string of a task's + PID (process ID), stored as null-terminated + string in item.str. Its length can also be + derived from the item's total size. Receivers of this item should + not use its contents for any kind of security measures. See below. + + + + + KDBUS_ITEM_EXE [*] + + Contains the path to the executable of a task, + stored as null-terminated string in item.str. Its + length can also be derived from the item's total size. Receivers of + this item should not use its contents for any kind of security + measures. See below. + + + + + KDBUS_ITEM_CMDLINE [*] + + Contains the command line arguments of a + task, stored as an array of null-terminated + strings in item.str. The total length of all + strings in the array can be derived from the item's total size. + Receivers of this item should not use its contents for any kind + of security measures. See below. + + + + + KDBUS_ITEM_CGROUP + + Contains the cgroup path of a task, stored + as null-terminated string in item.str. Its + length can also be derived from the item's total size. + + + + + KDBUS_ITEM_CAPS + + Contains sets of capabilities, stored as + struct kdbus_caps in item.caps. + As the item size may increase in the future, programs should be + written in a way that it takes + item.caps.last_cap into account, and derive + the number of sets and rows from the item size and the reported + number of valid capability bits. + +struct kdbus_caps { + __u32 last_cap; + __u32 caps[0]; +}; + + + + + + KDBUS_ITEM_SECLABEL + + Contains the LSM label of a task, stored as + null-terminated string in item.str. Its length + can also be derived from the item's total size. + + + + + KDBUS_ITEM_AUDIT + + Contains the audit sessionid and + loginuid of a task, stored as + struct kdbus_audit in + item.audit. + +struct kdbus_audit { + __u32 sessionid; + __u32 loginuid; +}; + + + + + + KDBUS_ITEM_CONN_DESCRIPTION + + Contains the connection description, as set + by KDBUS_CMD_HELLO or + KDBUS_CMD_CONN_UPDATE, stored as + null-terminated string in item.str. Its length + can also be derived from the item's total size. + + + + + + All metadata is automatically translated into the + namespaces of the task that receives them. See + + kdbus.message + 7 + + for more information. + + + + [*] Note that the content stored in metadata items of type + KDBUS_ITEM_TID_COMM, + KDBUS_ITEM_PID_COMM, + KDBUS_ITEM_EXE and + KDBUS_ITEM_CMDLINE + can easily be tampered by the sending tasks. Therefore, they should + not be used for any sort of security relevant + assumptions. 
The only reason they are transmitted is to let + receivers know about details that were set when metadata was + collected, even though the task they were collected from is not + active any longer when the items are received. + + + + + Items used for policy entries, matches and notifications + + + + KDBUS_ITEM_POLICY_ACCESS + + This item describes a policy access entry to + access the policy database of a + + kdbus.bus + 7 + or + + kdbus.endpoint + 7 + . + Please refer to + + kdbus.policy + 7 + + for more information on the policy database and how to access it. + +struct kdbus_policy_access { + __u64 type; + __u64 access; + __u64 id; +}; + + + + + + KDBUS_ITEM_ID_ADD + KDBUS_ITEM_ID_REMOVE + + This item is sent as attachment to a + kernel notification and indicates that a + new connection was created on the bus, or that a connection was + disconnected, respectively. It stores a + struct kdbus_notify_id_change in + item.id_change. + The id field contains the numeric ID of the + connection that was added or removed, and flags + is set to the connection flags, as passed by + KDBUS_CMD_HELLO. See + + kdbus.match + 7 + + and + + kdbus.message + 7 + + for more information on matches and notification messages. + +struct kdbus_notify_id_change { + __u64 id; + __u64 flags; +}; + + + + + + KDBUS_ITEM_NAME_ADD + KDBUS_ITEM_NAME_REMOVE + KDBUS_ITEM_NAME_CHANGE + + This item is sent as attachment to a + kernel notification and indicates that a + well-known name appeared, disappeared or + transferred to another owner on the bus. It stores a + struct kdbus_notify_name_change in + item.name_change. + old_id describes the former owner of the name + and is set to 0 values in case of + KDBUS_ITEM_NAME_ADD. + new_id describes the new owner of the name and + is set to 0 values in case of + KDBUS_ITEM_NAME_REMOVE. + The name field contains the well-known name the + notification is about, as null-terminated string. See + + kdbus.match + 7 + + and + + kdbus.message + 7 + + for more information on matches and notification messages. + +struct kdbus_notify_name_change { + struct kdbus_notify_id_change old_id; + struct kdbus_notify_id_change new_id; + char name[0]; +}; + + + + + + KDBUS_ITEM_REPLY_TIMEOUT + + This item is sent as attachment to a + kernel notification. It informs the receiver + that an expected reply to a message was not received in time. + The remote peer ID and the message cookie are stored in the message + header. See + + kdbus.message + 7 + + for more information about messages, timeouts and notifications. + + + + + KDBUS_ITEM_REPLY_DEAD + + This item is sent as attachment to a + kernel notification. It informs the receiver + that a remote connection a reply is expected from was disconnected + before that reply was sent. The remote peer ID and the message + cookie are stored in the message header. See + + kdbus.message + 7 + + for more information about messages, timeouts and notifications. 
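Tying the above together, a receiver typically walks the items attached to a returned object with the iterator macros shown earlier on this page (they are documentation examples, not part of the uapi header) and switches on the type, silently skipping types it does not know. A brief sketch:

    /* Sketch: iterate over the metadata items of a received message and
     * print a few of them. Assumes the KDBUS_ALIGN8/KDBUS_ITEM_NEXT/
     * KDBUS_ITEM_FOREACH macros from "Iterating items" are in scope. */
    #include <stdio.h>
    #include <stdint.h>
    #include <linux/kdbus.h>

    static void dump_items(struct kdbus_msg *msg)
    {
            struct kdbus_item *item;

            KDBUS_ITEM_FOREACH(item, msg, items) {
                    switch (item->type) {
                    case KDBUS_ITEM_PIDS:
                            printf("sender pid: %llu\n",
                                   (unsigned long long)item->pids.pid);
                            break;
                    case KDBUS_ITEM_TIMESTAMP:
                            printf("monotonic ts: %llu ns\n",
                                   (unsigned long long)item->timestamp.monotonic_ns);
                            break;
                    default:
                            /* Unknown item types are not an error; skip. */
                            break;
                    }
            }
    }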
+ + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.connection + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + memfd_create + 2 + + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.match.xml linux-4.2-pck/Documentation/kdbus/kdbus.match.xml --- linux-4.2/Documentation/kdbus/kdbus.match.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.match.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,555 @@ + + + + + + + kdbus.match + kdbus.match + + + + kdbus.match + 7 + + + + kdbus.match + kdbus match + + + + Description + + + kdbus connections can install matches in order to subscribe to signal + messages sent on the bus. Such signal messages can be either directed + to a single connection (by setting a specific connection ID in + struct kdbus_msg.dst_id or by sending it to a + well-known name), or to potentially all currently + active connections on the bus (by setting + struct kdbus_msg.dst_id to + KDBUS_DST_ID_BROADCAST). + A signal message always has the KDBUS_MSG_SIGNAL + bit set in the flags bitfield. + Also, signal messages can originate from either the kernel (called + notifications), or from other bus connections. + In either case, a bus connection needs to have a suitable + match installed in order to receive any signal + message. Without any rules installed in the connection, no signal message + will be received. + + + + + Matches for signal messages from other connections + + Matches for messages from other connections (not kernel notifications) + are implemented as bloom filters (see below). The sender adds certain + properties of the message as elements to a bloom filter bit field, and + sends that along with the signal message. + + The receiving connection adds the message properties it is interested in + as elements to a bloom mask bit field, and uploads the mask as match rule, + possibly along with some other rules to further limit the match. + + The kernel will match the signal message's bloom filter against the + connection's bloom mask (simply by &-ing it), and will decide whether + the message should be delivered to a connection. + + + The kernel has no notion of any specific properties of the signal message, + all it sees are the bit fields of the bloom filter and the mask to match + against. The use of bloom filters allows simple and efficient matching, + without exposing any message properties or internals to the kernel side. + Clients need to deal with the fact that they might receive signal messages + which they did not subscribe to, as the bloom filter might allow + false-positives to pass the filter. + + To allow the future extension of the set of elements in the bloom filter, + the filter specifies a generation number. A later + generation must always contain all elements of the set of the previous + generation, but can add new elements to the set. The match rules mask can + carry an array with all previous generations of masks individually stored. + When the filter and mask are matched by the kernel, the mask with the + closest matching generation is selected as the index into the mask array. + + + + + Bloom filters + + Bloom filters allow checking whether a given word is present in a + dictionary. This allows connections to set up a mask for information it + is interested in, and will be delivered signal messages that have a + matching filter. 
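The kernel-side decision is nothing more than the bit-wise AND described below; how message properties map to bit positions is entirely a userspace convention. A purely illustrative sketch (the set_bit() helper and the bit positions are assumptions, not part of the kdbus API):

    /* Sketch: a filter passes a mask if every bit set in the filter is
     * also set in the mask. Real clients derive bit positions from
     * hashed message properties; this only shows the bit arithmetic. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    static void set_bit(uint64_t *field, size_t nbit)
    {
            field[nbit / 64] |= 1ULL << (nbit % 64);
    }

    static bool filter_matches_mask(const uint64_t *filter,
                                    const uint64_t *mask, size_t nwords)
    {
            for (size_t i = 0; i < nwords; i++)
                    if ((filter[i] & mask[i]) != filter[i])
                            return false; /* filter bit missing in mask */

            return true; /* may still be a false positive */
    }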
+For general information, see the Wikipedia article on bloom filters.
+
+The size of the bloom filter is defined per bus when it is created,
+in kdbus_bloom_parameter.size. All bloom filters attached to signal
+messages on the bus must match this size, and all bloom filter
+matches uploaded by connections must also match the size, or a
+multiple thereof (see below).
+
+The calculation of the mask has to be done in userspace
+applications. The kernel just checks the bitmasks to decide whether
+or not to let the message pass. All bits set in the filter must also
+be set in the mask (a bit-wise AND check), but the mask may have
+more bits set than the filter. Consequently, false positive matches
+are expected to happen, and programs must deal with that fact by
+checking the contents of the payload again at receive time.
+
+Masks are entities that are always passed to the kernel as part of a
+match (with an item of type KDBUS_ITEM_BLOOM_MASK), and filters can
+be attached to signals, with an item of type
+KDBUS_ITEM_BLOOM_FILTER. For a filter to match, all its bits have to
+be set in the match mask as well.
+
+For example, consider a bus that has a bloom size of 8 bytes, and
+the following mask/filter combinations:
+
+        filter  0x0101010101010101
+        mask    0x0101010101010101
+                -> matches
+
+        filter  0x0303030303030303
+        mask    0x0101010101010101
+                -> doesn't match
+
+        filter  0x0101010101010101
+        mask    0x0303030303030303
+                -> matches
+
+Hence, in order to catch all messages, a mask filled with 0xff bytes
+can be installed as a wildcard match rule.
+
+Generations
+
+Uploaded matches may contain multiple masks, which have to be as
+large as the bloom filter size defined by the bus. Each block of a
+mask is called a generation, starting at index 0.
+
+At match time, when a signal is about to be delivered, a bloom mask
+generation is passed, which denotes which of the bloom masks the
+filter should be matched against. This allows programs to provide
+backward compatible masks at upload time, while older clients can
+still match against older versions of filters.
+
+Matches for kernel notifications
+
+To receive kernel-generated notifications (see kdbus.message 7 ), a
+connection must install match rules that are different from the
+bloom filter matches described in the section above. They can be
+filtered by the connection ID that caused the notification to be
+sent, by one of the names it currently owns, or by the type of the
+notification (ID/name add/remove/change).
+
+Adding a match
+
+To add a match, the KDBUS_CMD_MATCH_ADD ioctl is used, which takes a
+struct kdbus_cmd_match argument as described below.
+
+Note that each of the items attached to this command will internally
+create one match rule, and the collection of them, which is
+submitted as one block via the ioctl, is called a match. To allow a
+message to pass, all rules of a match have to be satisfied. Hence,
+adding more items to the command will only narrow the set of
+messages that the match lets pass, and will decrease the chance that
+the connection's process will be woken up needlessly.
+
+Multiple matches can be installed per connection. As long as at
+least one of them has a set of rules that allows the message to
+pass, the message will be delivered.
+
+struct kdbus_cmd_match {
+  __u64 size;
+  __u64 flags;
+  __u64 return_flags;
+  __u64 cookie;
+  struct kdbus_item items[0];
+};
+
+The fields in this struct are described below.
+
+size
+    The overall size of the struct, including its items.
+
+flags
+    Flags to control the behavior of the ioctl.
+
+    KDBUS_MATCH_REPLACE
+        Remove any match that was previously installed with the same
+        cookie before adding this one.
+
+    KDBUS_FLAG_NEGOTIATE
+        Requests a set of valid flags for this ioctl. When this bit
+        is set, no action is taken; the ioctl will return 0, and the
+        flags field will have all bits set that are valid for this
+        command. The KDBUS_FLAG_NEGOTIATE bit will be cleared by the
+        operation.
+
+return_flags
+    Flags returned by the kernel. Currently unused and always set to
+    0 by the kernel.
+
+cookie
+    A cookie which identifies the match, so it can be referred to
+    when removing it.
+
+items
+    Items to define the actual rules of the matches. The following
+    item types are expected. Each item will create one new match
+    rule.
+
+    KDBUS_ITEM_BLOOM_MASK
+        An item that carries the bloom filter mask to match against
+        in its data field. The payload size must match the bloom
+        filter size that was specified when the bus was created. See
+        the "Bloom filters" section above for more information on
+        bloom filters.
+
+    KDBUS_ITEM_NAME
+        When used as part of kernel notifications, this item
+        specifies a name that is acquired, lost or that changed its
+        owner (see below). When used as part of a match for
+        user-generated signal messages, it specifies a name that the
+        sending connection must own at the time of sending the
+        signal.
+
+    KDBUS_ITEM_ID
+        Specify a sender connection's ID that will match this rule.
+        For kernel notifications, this specifies the ID of a
+        connection that was added to or removed from the bus. For
+        user-generated signals, it specifies the ID of the
+        connection that sent the signal message.
+
+    KDBUS_ITEM_NAME_ADD
+    KDBUS_ITEM_NAME_REMOVE
+    KDBUS_ITEM_NAME_CHANGE
+        These items request delivery of kernel notifications that
+        describe a name acquisition, loss, or change. The details
+        are stored in the item's kdbus_notify_name_change member.
+        All information specified must be matched in order to make
+        the message pass. Use KDBUS_MATCH_ID_ANY to match against
+        any unique connection ID.
+
+    KDBUS_ITEM_ID_ADD
+    KDBUS_ITEM_ID_REMOVE
+        These items request delivery of kernel notifications that
+        are generated when a connection is created or terminated.
+        struct kdbus_notify_id_change is used to store the actual
+        match information. This item can be used to monitor one
+        particular connection ID, or, when the ID field is set to
+        KDBUS_MATCH_ID_ANY, all of them.
+
+    KDBUS_ITEM_NEGOTIATE
+        With this item, programs can probe the kernel for known item
+        types. See kdbus.item 7 for more details.
+
+    Unrecognized items are rejected, and the ioctl will fail with
+    errno set to EINVAL.
+
+Refer to kdbus.message 7 for more information on message types.
+
+Removing a match
+
+Matches can be removed with the KDBUS_CMD_MATCH_REMOVE ioctl, which
+takes a struct kdbus_cmd_match as argument, but the usage of its
+fields differs slightly from that of KDBUS_CMD_MATCH_ADD.
+
+struct kdbus_cmd_match {
+  __u64 size;
+  __u64 cookie;
+  __u64 flags;
+  __u64 return_flags;
+  struct kdbus_item items[0];
+};
+
+The fields in this struct are described below.
+
+size
+    The overall size of the struct, including its items.
+
+cookie
+    The cookie of the match, as it was passed when the match was
+    added.
+ All matches that have this cookie will be removed. + + + + + flags + + No flags are supported for this use case. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. If set, the ioctl will fail with + -1, errno is set to + EPROTO, and the flags field + is set to 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + No items are supported for this use case, but + KDBUS_ITEM_NEGOTIATE is allowed nevertheless. + + + + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_MATCH_ADD</constant> may fail with the following + errors + + + + + EINVAL + + Illegal flags or items. + + + + + EDOM + + Illegal bloom filter size. + + + + + EMFILE + + Too many matches for this connection. + + + + + + + + <constant>KDBUS_CMD_MATCH_REMOVE</constant> may fail with the following + errors + + + + + EINVAL + + Illegal flags. + + + + + EBADSLT + + A match entry with the given cookie could not be found. + + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.match + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.item + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.message.xml linux-4.2-pck/Documentation/kdbus/kdbus.message.xml --- linux-4.2/Documentation/kdbus/kdbus.message.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.message.xml 2015-09-08 00:48:22.211698004 -0300 @@ -0,0 +1,1276 @@ + + + + + + + kdbus.message + kdbus.message + + + + kdbus.message + 7 + + + + kdbus.message + kdbus message + + + + Description + + + A kdbus message is used to exchange information between two connections + on a bus, or to transport notifications from the kernel to one or many + connections. This document describes the layout of messages, how payload + is added to them and how they are sent and received. + + + + + Message layout + + The layout of a message is shown below. + + + +-------------------------------------------------------------------------+ + | Message | + | +---------------------------------------------------------------------+ | + | | Header | | + | | size: overall message size, including the data records | | + | | destination: connection ID of the receiver | | + | | source: connection ID of the sender (set by kernel) | | + | | payload_type: "DBusDBus" textual identifier stored as uint64_t | | + | +---------------------------------------------------------------------+ | + | +---------------------------------------------------------------------+ | + | | Data Record | | + | | size: overall record size (without padding) | | + | | type: type of data | | + | | data: reference to data (address or file descriptor) | | + | +---------------------------------------------------------------------+ | + | +---------------------------------------------------------------------+ | + | | padding bytes to the next 8 byte alignment | | + | +---------------------------------------------------------------------+ | + | +---------------------------------------------------------------------+ | + | | Data Record | | + | | size: overall record size (without padding) | | + | | ... 
| |
+ | +---------------------------------------------------------------------+ |
+ | +---------------------------------------------------------------------+ |
+ | | padding bytes to the next 8 byte alignment                          | |
+ | +---------------------------------------------------------------------+ |
+ | +---------------------------------------------------------------------+ |
+ | | Data Record                                                         | |
+ | | size: overall record size                                           | |
+ | | ...                                                                  | |
+ | +---------------------------------------------------------------------+ |
+ |  ... further data records ...                                          |
+ +-------------------------------------------------------------------------+
+
+Message payload
+
+When connecting to the bus, receivers request a memory pool of a
+given size, large enough to carry all backlog of data enqueued for
+the connection. The pool is internally backed by a shared memory
+file which can be mmap()ed by the receiver. See kdbus.pool 7 for
+more information.
+
+Message payload must be described in items attached to a message
+when it is sent. A receiver can access the payload by looking at the
+items that are attached to a message in its pool. The following
+items are used.
+
+KDBUS_ITEM_PAYLOAD_VEC
+    This item references a piece of memory on the sender side which
+    is directly copied into the receiver's pool. This way, two peers
+    can exchange data by effectively doing a single-copy from one
+    process to another; the kernel will not buffer the data anywhere
+    else. This item is never found in a message received by a
+    connection.
+
+KDBUS_ITEM_PAYLOAD_OFF
+    This item is attached to messages on the receiving side and
+    points to a memory area inside the receiver's pool. The offset
+    variable in the item denotes the memory location relative to the
+    message itself.
+
+KDBUS_ITEM_PAYLOAD_MEMFD
+    Messages can reference memfd files which contain the data. memfd
+    files are tmpfs-backed files that allow sealing of the content
+    of the file, which prevents all writable access to the file
+    content.
+
+    Only memfds that have
+    (F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_SEAL) set are
+    accepted as payload data, which enforces reliable passing of
+    data. The receiver can assume that neither the sender nor anyone
+    else can alter the content after the message is sent. If those
+    seals are not set on the memfd, the ioctl will fail with -1, and
+    errno will be set to ETXTBSY.
+
+KDBUS_ITEM_FDS
+    Messages can transport regular file descriptors via
+    KDBUS_ITEM_FDS. This item carries an array of int values in
+    item.fd. The maximum number of file descriptors in the item is
+    253, and only one item of this type is accepted per message. All
+    passed values must be valid file descriptors; the open count of
+    each file descriptor is increased by installing it into the
+    receiver's task. This item can only be used for directed
+    messages, not for broadcasts, and only to remote peers that have
+    opted in to receiving file descriptors at connection time
+    (KDBUS_HELLO_ACCEPT_FD).
+
+The sender must not make any assumptions about the form in which
+data is received by the remote peer. The kernel is free to re-pack
+multiple KDBUS_ITEM_PAYLOAD_VEC and KDBUS_ITEM_PAYLOAD_MEMFD
+payloads. For instance, the kernel may decide to merge multiple VECs
+into a single VEC, inline MEMFD payloads into memory, or merge all
+passed VECs into a single MEMFD. However, the kernel preserves the
+order of passed data.
+This means that the order of all VEC and MEMFD items is not changed
+with respect to each other. In other words: all passed VEC and MEMFD
+data payloads are treated as a single stream of data that may be
+received by the remote peer in a different set of chunks than it was
+sent in.
+
+Sending messages
+
+Messages are passed to the kernel with the KDBUS_CMD_SEND ioctl.
+Depending on the destination address of the message, the kernel
+delivers the message to the specific destination connection, or to
+some subset of all connections on the same bus. Sending messages
+across buses is not possible. Messages are always queued in the
+memory pool of the destination connection (see above).
+
+The KDBUS_CMD_SEND ioctl uses a struct kdbus_cmd_send to describe
+the message transfer.
+
+struct kdbus_cmd_send {
+  __u64 size;
+  __u64 flags;
+  __u64 return_flags;
+  __u64 msg_address;
+  struct kdbus_msg_info reply;
+  struct kdbus_item items[0];
+};
+
+The fields in this struct are described below.
+
+size
+    The overall size of the struct, including its items.
+
+flags
+    Flags for message delivery.
+
+    KDBUS_SEND_SYNC_REPLY
+        By default, all calls to kdbus are considered asynchronous,
+        non-blocking. However, as there are many use cases that need
+        to wait for a remote peer to answer a method call, there's a
+        way to send a message and wait for a reply in a synchronous
+        fashion. This is what the KDBUS_SEND_SYNC_REPLY flag
+        controls. The KDBUS_CMD_SEND ioctl will block until the
+        reply has arrived, the timeout limit is reached, the remote
+        connection is shut down, or the call is interrupted by a
+        signal before any reply arrives; see signal 7 .
+
+        The offset of the reply message in the sender's pool is
+        stored in reply when the ioctl has returned without error.
+        Hence, there is no need for another KDBUS_CMD_RECV ioctl or
+        anything else to receive the reply.
+
+    KDBUS_FLAG_NEGOTIATE
+        Request a set of valid flags for this ioctl. When this bit
+        is set, no action is taken; the ioctl will fail with -1, and
+        errno is set to EPROTO. Once the ioctl has returned, the
+        flags field will have all bits set that the kernel
+        recognizes as valid for this command. The
+        KDBUS_FLAG_NEGOTIATE bit will be cleared by the operation.
+
+return_flags
+    Flags returned by the kernel. Currently unused and always set to
+    0 by the kernel.
+
+msg_address
+    In this field, users have to provide a pointer to a message
+    (struct kdbus_msg) to send. See below for a detailed
+    description.
+
+reply
+    Only used for synchronous replies. See the description of
+    struct kdbus_cmd_recv for more details.
+
+items
+    The following items are currently recognized.
+
+    KDBUS_ITEM_CANCEL_FD
+        When this optional item is passed in, and the call is
+        executed as a SYNC call, the passed-in file descriptor can
+        be used as an alternative cancellation point. The kernel
+        will call poll 2 on this file descriptor, and once it
+        reports any incoming bytes, the blocking send operation will
+        be canceled; the blocking, synchronous ioctl call will
+        return -1, and errno will be set to ECANCELED. Any type of
+        file descriptor on which poll 2 can be called can be used as
+        payload to this item; for example, an eventfd can be used
+        for this purpose, see eventfd 2 . For asynchronous message
+        sending, this item is allowed but ignored.
+ + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + The message referenced by the msg_address above has + the following layout. + + + +struct kdbus_msg { + __u64 size; + __u64 flags; + __s64 priority; + __u64 dst_id; + __u64 src_id; + __u64 payload_type; + __u64 cookie; + __u64 timeout_ns; + __u64 cookie_reply; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + Flags to describe message details. + + + KDBUS_MSG_EXPECT_REPLY + + + Expect a reply to this message from the remote peer. With + this bit set, the timeout_ns field must be set to a non-zero + number of nanoseconds in which the receiving peer is expected + to reply. If such a reply is not received in time, the sender + will be notified with a timeout message (see below). The + value must be an absolute value, in nanoseconds and based on + CLOCK_MONOTONIC. + + For a message to be accepted as reply, it must be a direct + message to the original sender (not a broadcast and not a + signal message), and its + kdbus_msg.cookie_reply must match the + previous message's kdbus_msg.cookie. + + Expected replies also temporarily open the policy of the + sending connection, so the other peer is allowed to respond + within the given time window. + + + + + + KDBUS_MSG_NO_AUTO_START + + + By default, when a message is sent to an activator + connection, the activator is notified and will start an + implementer. This flag inhibits that behavior. With this bit + set, and the remote being an activator, the ioctl will fail + with errno set to + EADDRNOTAVAIL. + + + + + + KDBUS_FLAG_NEGOTIATE + + + Requests a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will return + 0, and the flags + field will have all bits set that are valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + priority + + The priority of this message. Receiving messages (see below) may + optionally be constrained to messages of a minimal priority. This + allows for use cases where timing critical data is interleaved with + control data on the same connection. If unused, the priority field + should be set to 0. + + + + + dst_id + + The numeric ID of the destination connection, or + KDBUS_DST_ID_BROADCAST + (~0ULL) to address every peer on the bus, or + KDBUS_DST_ID_NAME (0) to look + it up dynamically from the bus' name registry. + In the latter case, an item of type + KDBUS_ITEM_DST_NAME is mandatory. + Also see + + kdbus.name + 7 + + . + + + + + src_id + + Upon return of the ioctl, this member will contain the sending + connection's numerical ID. Should be 0 at send time. + + + + + payload_type + + Type of the payload in the actual data records. Currently, only + KDBUS_PAYLOAD_DBUS is accepted as input value + of this field. When receiving messages that are generated by the + kernel (notifications), this field will contain + KDBUS_PAYLOAD_KERNEL. + + + + + cookie + + Cookie of this message, for later recognition. Also, when replying + to a message (see above), the cookie_reply + field must match this value. + + + + + timeout_ns + + If the message sent requires a reply from the remote peer (see above), + this field contains the timeout in absolute nanoseconds based on + CLOCK_MONOTONIC. Also see + + clock_gettime + 2 + . 
+cookie_reply
+    If the message sent is a reply to another message, this field
+    must match the cookie of the formerly received message.
+
+items
+    A dynamically sized list of items to contain additional
+    information. The following items are expected/valid:
+
+    KDBUS_ITEM_PAYLOAD_VEC
+    KDBUS_ITEM_PAYLOAD_MEMFD
+    KDBUS_ITEM_FDS
+        Actual data records containing the payload. See section
+        "Message payload".
+
+    KDBUS_ITEM_BLOOM_FILTER
+        Bloom filter for matches (see below).
+
+    KDBUS_ITEM_DST_NAME
+        Well-known name to send this message to. Required if dst_id
+        is set to KDBUS_DST_ID_NAME. If a connection holding the
+        given name can't be found, the ioctl will fail, and errno
+        will be set to ESRCH.
+
+        For messages to a unique name (ID), this item is optional.
+        If present, the kernel will make sure the name owner matches
+        the given unique name. This allows programs to tie the
+        message sending to the condition that a name is currently
+        owned by a certain unique name.
+
+The message will be augmented by the requested metadata items when
+queued into the receiver's pool. See kdbus.connection 7 and
+kdbus.item 7 for more information on metadata.
+
+Receiving messages
+
+Messages are received by the client with the KDBUS_CMD_RECV ioctl.
+The endpoint file of the bus supports poll()/epoll()/select(); when
+new messages are available on the connection's file descriptor,
+POLLIN is reported. For compatibility reasons, POLLOUT is always
+reported as well. Note, however, that the latter does not guarantee
+that a message can in fact be sent, as this depends on how many
+pending messages the receiver has in its pool.
+
+With the KDBUS_CMD_RECV ioctl, a struct kdbus_cmd_recv is used.
+
+struct kdbus_cmd_recv {
+  __u64 size;
+  __u64 flags;
+  __u64 return_flags;
+  __s64 priority;
+  __u64 dropped_msgs;
+  struct kdbus_msg_info msg;
+  struct kdbus_item items[0];
+};
+
+The fields in this struct are described below.
+
+size
+    The overall size of the struct, including its items.
+
+flags
+    Flags to control the receive command.
+
+    KDBUS_RECV_PEEK
+        Just return the location of the next message. Do not install
+        file descriptors or anything else. This is usually used to
+        determine the sender of the next queued message.
+
+    KDBUS_RECV_DROP
+        Drop the next message without doing anything else with it,
+        and free the pool slice. This is a short-cut for
+        KDBUS_RECV_PEEK and KDBUS_CMD_FREE.
+
+    KDBUS_RECV_USE_PRIORITY
+        Dequeue messages ordered by their priority, filtering them
+        by the priority field (see below).
+
+    KDBUS_FLAG_NEGOTIATE
+        Request a set of valid flags for this ioctl. When this bit
+        is set, no action is taken; the ioctl will fail with -1, and
+        errno is set to EPROTO. Once the ioctl has returned, the
+        flags field will have all bits set that the kernel
+        recognizes as valid for this command.
+
+return_flags
+    Flags returned by the kernel. If the dropped_msgs field is
+    non-zero, KDBUS_RECV_RETURN_DROPPED_MSGS is set. If a file
+    descriptor could not be installed, the
+    KDBUS_RECV_RETURN_INCOMPLETE_FDS flag is set.
+
+priority
+    With KDBUS_RECV_USE_PRIORITY set in flags, messages will be
+    dequeued ordered by their priority, starting with the highest
+    value. Also, messages will be filtered by the value given in
+    this field, so the returned message will at least have the
+    requested priority. If no such message is waiting in the queue,
+    the ioctl will fail, and errno will be set to EAGAIN.
+
+dropped_msgs
+    Whenever a message with KDBUS_MSG_SIGNAL is sent but cannot be
+    queued on a peer (e.g., because it contains FDs but the peer
+    does not support FDs, or because there is no space left in the
+    peer's pool), the dropped_msgs counter of the peer is
+    incremented. On the next RECV ioctl, the dropped_msgs field is
+    copied into the ioctl struct and cleared on the peer. If it was
+    non-zero, the KDBUS_RECV_RETURN_DROPPED_MSGS flag will be set in
+    return_flags. Note that this will only happen if the ioctl
+    succeeded or failed with EAGAIN. In other error cases, the
+    dropped_msgs field of the peer is left untouched.
+
+msg
+    Embedded struct containing information on the received message
+    when this command succeeded (see below).
+
+items
+    Items to specify further details for the receive command.
+    Currently unused, and all items will be rejected with errno set
+    to EINVAL.
+
+Both struct kdbus_cmd_recv and struct kdbus_cmd_send embed
+struct kdbus_msg_info. For the KDBUS_CMD_SEND ioctl, it is used to
+catch synchronous replies, if one was requested, and is unused
+otherwise.
+
+struct kdbus_msg_info {
+  __u64 offset;
+  __u64 msg_size;
+  __u64 return_flags;
+};
+
+The fields in this struct are described below.
+
+offset
+    Upon return of the ioctl, this field contains the offset in the
+    receiver's memory pool. The memory must be freed with
+    KDBUS_CMD_FREE. See kdbus.pool 7 for further details.
+
+msg_size
+    Upon successful return of the ioctl, this field contains the
+    size of the allocated slice at offset offset. It is the
+    combination of the size of the stored struct kdbus_msg object
+    plus all appended VECs. You can use it in combination with
+    offset to map a single message, instead of mapping the entire
+    pool. See kdbus.pool 7 for further details.
+
+return_flags
+    Kernel-provided return flags. Currently, the following flags are
+    defined.
+
+    KDBUS_RECV_RETURN_INCOMPLETE_FDS
+        The message contained memfds or file descriptors, and the
+        kernel failed to install one or more of them at receive
+        time. Most probably that happened because the maximum number
+        of file descriptors for the receiver's task was exceeded. In
+        such cases, the message is still delivered, so this is not a
+        fatal condition. File descriptor numbers inside the
+        KDBUS_ITEM_FDS item or memfd files referenced by
+        KDBUS_ITEM_PAYLOAD_MEMFD items which could not be installed
+        will be set to -1.
+
+Unless KDBUS_RECV_DROP was passed, the offset field contains the
+location of the new message inside the receiver's pool after the
+KDBUS_CMD_RECV ioctl was employed. The message is stored as
+struct kdbus_msg at this offset, and can be interpreted with the
+semantics described above.
+
+Also, if the connection allowed for file descriptors to be passed
+(KDBUS_HELLO_ACCEPT_FD), and if the message contained any, they will
+be installed into the receiving process when the KDBUS_CMD_RECV
+ioctl is called. memfds may always be part of the message payload.
+The receiving task is obliged to close all file descriptors
+appropriately once no longer needed. If KDBUS_RECV_PEEK is set, no
+file descriptors are installed.
This allows for peeking at a message, + looking at its metadata only and dropping it via + KDBUS_RECV_DROP, without installing any of the file + descriptors into the receiving process. + + + The caller is obliged to call the KDBUS_CMD_FREE + ioctl with the returned offset when the memory is no longer needed. + + + + + Notifications + + A kernel notification is a regular kdbus message with the following + details. + + + + + kdbus_msg.src_id == KDBUS_SRC_ID_KERNEL + + + kdbus_msg.dst_id == KDBUS_DST_ID_BROADCAST + + + kdbus_msg.payload_type == KDBUS_PAYLOAD_KERNEL + + + Has exactly one of the items attached that are described below. + + + Always has a timestamp item (KDBUS_ITEM_TIMESTAMP) + attached. + + + + + The kernel will notify its users of the following events. + + + + + When connection A is terminated while connection + B is waiting for a reply from it, connection + B is notified with a message with an item of + type KDBUS_ITEM_REPLY_DEAD. + + + + When connection A does not receive a reply from + connection B within the specified timeout window, + connection A will receive a message with an + item of type KDBUS_ITEM_REPLY_TIMEOUT. + + + + When an ordinary connection (not a monitor) is created on or removed + from a bus, messages with an item of type + KDBUS_ITEM_ID_ADD or + KDBUS_ITEM_ID_REMOVE, respectively, are delivered + to all bus members that match these messages through their match + database. Eavesdroppers (monitor connections) do not cause such + notifications to be sent. They are invisible on the bus. + + + + When a connection gains or loses ownership of a name, messages with an + item of type KDBUS_ITEM_NAME_ADD, + KDBUS_ITEM_NAME_REMOVE or + KDBUS_ITEM_NAME_CHANGE are delivered to all bus + members that match these messages through their match database. + + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_SEND</constant> may fail with the following + errors + + + + + EOPNOTSUPP + + The connection is not an ordinary connection, or the passed + file descriptors in KDBUS_ITEM_FDS item are + either kdbus handles or unix domain sockets. Both are currently + unsupported. + + + + + EINVAL + + The submitted payload type is + KDBUS_PAYLOAD_KERNEL, + KDBUS_MSG_EXPECT_REPLY was set without timeout + or cookie values, KDBUS_SEND_SYNC_REPLY was + set without KDBUS_MSG_EXPECT_REPLY, an invalid + item was supplied, src_id was non-zero and was + different from the current connection's ID, a supplied memfd had a + size of 0, or a string was not properly null-terminated. + + + + + ENOTUNIQ + + The supplied destination is + KDBUS_DST_ID_BROADCAST and either + file descriptors were passed, or + KDBUS_MSG_EXPECT_REPLY was set, + or a timeout was given. + + + + + E2BIG + + Too many items. + + + + + EMSGSIZE + + The size of the message header and items or the payload vector + is excessive. + + + + + EEXIST + + Multiple KDBUS_ITEM_FDS, + KDBUS_ITEM_BLOOM_FILTER or + KDBUS_ITEM_DST_NAME items were supplied. + + + + + EBADF + + The supplied KDBUS_ITEM_FDS or + KDBUS_ITEM_PAYLOAD_MEMFD items + contained an illegal file descriptor. + + + + + EMEDIUMTYPE + + The supplied memfd is not a sealed kdbus memfd. + + + + + EMFILE + + Too many file descriptors inside a + KDBUS_ITEM_FDS. 
+EBADMSG
+    An item had an illegal size, both a dst_id and a
+    KDBUS_ITEM_DST_NAME were given, or both a name and a bloom
+    filter were given.
+
+ETXTBSY
+    The supplied kdbus memfd file cannot be sealed or the seal was
+    removed, because it is shared with other processes or still
+    mapped with mmap 2 .
+
+ECOMM
+    A peer does not accept the file descriptors addressed to it.
+
+EFAULT
+    The supplied bloom filter size was not 64-bit aligned, or
+    supplied memory could not be accessed by the kernel.
+
+EDOM
+    The supplied bloom filter size did not match the bloom filter
+    size of the bus.
+
+EDESTADDRREQ
+    dst_id was set to KDBUS_DST_ID_NAME, but no KDBUS_ITEM_DST_NAME
+    was attached.
+
+ESRCH
+    The name to look up was not found in the name registry.
+
+EADDRNOTAVAIL
+    KDBUS_MSG_NO_AUTO_START was given but the destination connection
+    is an activator.
+
+ENXIO
+    The passed numeric destination connection ID couldn't be found,
+    or is not connected.
+
+ECONNRESET
+    The destination connection is no longer active.
+
+ETIMEDOUT
+    Timeout while synchronously waiting for a reply.
+
+EINTR
+    Interrupted system call while synchronously waiting for a reply.
+
+EPIPE
+    When sending a message, a synchronous reply from the receiving
+    connection was expected but the connection died before
+    answering.
+
+ENOBUFS
+    Too many pending messages on the receiver side.
+
+EREMCHG
+    Both a well-known name and a unique name (ID) were given, but
+    the name is not currently owned by that connection.
+
+EXFULL
+    The memory pool of the receiver is full.
+
+EREMOTEIO
+    While synchronously waiting for a reply, the remote peer failed
+    with an I/O error.
+
+KDBUS_CMD_RECV may fail with the following errors
+
+EOPNOTSUPP
+    The connection is not an ordinary connection, or the passed file
+    descriptors are either kdbus handles or unix domain sockets.
+    Both are currently unsupported.
+
+EINVAL
+    Invalid flags or offset.
+
+EAGAIN
+    No message found in the queue.
+
+See Also
+
+kdbus 7
+kdbus.bus 7
+kdbus.connection 7
+kdbus.endpoint 7
+kdbus.fs 7
+kdbus.item 7
+kdbus.name 7
+kdbus.pool 7
+clock_gettime 2
+ioctl 2
+poll 2
+select 2
+epoll 7
+eventfd 2
+memfd_create 2
+
diff -Nur linux-4.2/Documentation/kdbus/kdbus.name.xml linux-4.2-pck/Documentation/kdbus/kdbus.name.xml
--- linux-4.2/Documentation/kdbus/kdbus.name.xml 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/Documentation/kdbus/kdbus.name.xml 2015-09-08 00:48:22.211698004 -0300
@@ -0,0 +1,711 @@
+
+kdbus.name
+kdbus.name
+
+kdbus.name
+7
+
+kdbus.name
+kdbus.name
+
+Description
+
+Each kdbus.bus 7 instantiates a name registry to resolve well-known
+names into unique connection IDs for message delivery. The registry
+will be queried when a message is sent with kdbus_msg.dst_id set to
+KDBUS_DST_ID_NAME, or when a registry dump is requested with
+KDBUS_CMD_LIST.
+
+All of the below is subject to policy rules for SEE and OWN
+permissions. See kdbus.policy 7 for more information.
+
+Name validity
+
+A name has to comply with the following rules in order to be
+considered valid.
+
+  * The name has two or more elements separated by a '.' (period)
+    character.
+
+  * All elements must contain at least one character.
+
+  * Each element must only contain the ASCII characters
+    [A-Z][a-z][0-9]_ and must not begin with a digit.
+
+  * The name must contain at least one '.' (period) character (and
+    thus at least two elements).
+
+  * The name must not begin with a '.' (period) character.
+
+  * The name must not exceed 255 characters in length.
+
+Acquiring a name
+
+To acquire a name, a client uses the KDBUS_CMD_NAME_ACQUIRE ioctl
+with struct kdbus_cmd as argument.
+
+struct kdbus_cmd {
+  __u64 size;
+  __u64 flags;
+  __u64 return_flags;
+  struct kdbus_item items[0];
+};
+
+The fields in this struct are described below.
+
+size
+    The overall size of the struct, including its items.
+
+flags
+    Flags to control details in the name acquisition.
+
+    KDBUS_NAME_REPLACE_EXISTING
+        Acquiring a name that is already present usually fails,
+        unless this flag is set in the call, and
+        KDBUS_NAME_ALLOW_REPLACEMENT (see below) was set when the
+        current owner of the name acquired it, or if the current
+        owner is an activator connection (see kdbus.connection 7 ).
+
+    KDBUS_NAME_ALLOW_REPLACEMENT
+        Allow other connections to take over this name. When this
+        happens, the former owner of the name will be notified of
+        the name loss.
+
+    KDBUS_NAME_QUEUE
+        A name that is already acquired by a connection can not be
+        acquired again (unless the KDBUS_NAME_ALLOW_REPLACEMENT flag
+        was set during acquisition; see above). However, a
+        connection can put itself in a queue of connections waiting
+        for the name to be released. Once that happens, the first
+        connection in that queue becomes the new owner and is
+        notified accordingly.
+
+    KDBUS_FLAG_NEGOTIATE
+        Request a set of valid flags for this ioctl. When this bit
+        is set, no action is taken; the ioctl will fail with -1, and
+        errno is set to EPROTO. Once the ioctl has returned, the
+        flags field will have all bits set that the kernel
+        recognizes as valid for this command. The
+        KDBUS_FLAG_NEGOTIATE bit will be cleared by the operation.
+
+return_flags
+    Flags returned by the kernel. Currently, the following may be
+    returned by the kernel.
+
+    KDBUS_NAME_IN_QUEUE
+        The name was not acquired yet, but the connection was placed
+        in the queue of peers waiting for the name. This can only
+        happen if KDBUS_NAME_QUEUE was set in the flags member (see
+        above). The connection will receive a name owner change
+        notification once the current owner has given up the name
+        and its ownership was transferred.
+
+items
+    Items to submit the name. Currently, one item of type
+    KDBUS_ITEM_NAME is expected and allowed, and the contained
+    string must be a valid bus name. KDBUS_ITEM_NEGOTIATE may be
+    used to probe for valid item types. See kdbus.item 7 for a
+    detailed description of how this item is used.
+
+    Unrecognized items are rejected, and the ioctl will fail with
+    errno set to EINVAL.
+
+Releasing a name
+
+A connection may release a name explicitly with the
+KDBUS_CMD_NAME_RELEASE ioctl. If the connection was an implementer
+of an activatable name, its pending messages are moved back to the
+activator. If there are any connections queued up as waiters for the
+name, the first one in the queue (the oldest entry) will become the
+new owner. The same happens implicitly for all names once a
+connection terminates. See kdbus.connection 7 for more information
+on connections.
+
+The KDBUS_CMD_NAME_RELEASE ioctl uses the same data structure as the
+acquisition call (KDBUS_CMD_NAME_ACQUIRE), but with slightly
+different field usage.
+
+struct kdbus_cmd {
+  __u64 size;
+  __u64 flags;
+  __u64 return_flags;
+  struct kdbus_item items[0];
+};
+
+The fields in this struct are described below.
+
+size
+    The overall size of the struct, including its items.
+
+flags
+    Flags to the command. Currently unused. KDBUS_FLAG_NEGOTIATE is
+    accepted to probe for valid flags. If set, the ioctl will return
+    0, and the flags field is set to 0.
+
+return_flags
+    Flags returned by the kernel. Currently unused and always set to
+    0 by the kernel.
+
+items
+    Items to submit the name. Currently, one item of type
+    KDBUS_ITEM_NAME is expected and allowed, and the contained
+    string must be a valid bus name. KDBUS_ITEM_NEGOTIATE may be
+    used to probe for valid item types. See kdbus.item 7 for a
+    detailed description of how this item is used.
+
+    Unrecognized items are rejected, and the ioctl will fail with
+    errno set to EINVAL.
+
+Dumping the name registry
+
+A connection may request a complete or filtered dump of currently
+active bus names with the KDBUS_CMD_LIST ioctl, which takes a
+struct kdbus_cmd_list as argument.
+
+struct kdbus_cmd_list {
+  __u64 flags;
+  __u64 return_flags;
+  __u64 offset;
+};
+
+The fields in this struct are described below.
+
+flags
+    Any combination of flags to specify which names should be
+    dumped.
+
+    KDBUS_LIST_UNIQUE
+        List the unique (numeric) IDs of connections, whether they
+        own a name or not.
+
+    KDBUS_LIST_NAMES
+        List well-known names stored in the database which are
+        actively owned by a real connection (not an activator).
+
+    KDBUS_LIST_ACTIVATORS
+        List names that are owned by an activator.
+
+    KDBUS_LIST_QUEUED
+        List connections that do not own a name yet, but are waiting
+        for it to become available.
+
+    KDBUS_FLAG_NEGOTIATE
+        Request a set of valid flags for this ioctl. When this bit
+        is set, no action is taken; the ioctl will fail with -1, and
+        errno is set to EPROTO. Once the ioctl has returned, the
+        flags field will have all bits set that the kernel
+        recognizes as valid for this command. The
+        KDBUS_FLAG_NEGOTIATE bit will be cleared by the operation.
+
+return_flags
+    Flags returned by the kernel. Currently unused and always set to
+    0 by the kernel.
+
+offset
+    When the ioctl returns successfully, the offset to the name
+    registry dump inside the connection's pool will be stored in
+    this field.
+
+The returned list of names is stored in a struct kdbus_list that in
+turn contains an array of type struct kdbus_info. The array size in
+bytes is given in list_size.
+
+struct kdbus_info {
+  __u64 size;
+  __u64 id;
+  __u64 flags;
+  struct kdbus_item items[0];
+};
+
+The fields in this struct are described below.
+
+size
+    The overall size of the struct, including its items.
+
+id
+    The owning connection's unique ID.
+
+flags
+    The flags of the owning connection.
+
+items
+    Items containing the actual name. Currently, one item of type
+    KDBUS_ITEM_OWNED_NAME will be attached, including the name's
+    flags.
In that item, the flags field of the + name may carry the following bits: + + + + KDBUS_NAME_ALLOW_REPLACEMENT + + + Other connections are allowed to take over this name from the + connection that owns it. + + + + + + KDBUS_NAME_IN_QUEUE + + + When retrieving a list of currently acquired names in the + registry, this flag indicates whether the connection + actually owns the name or is currently waiting for it to + become available. + + + + + + KDBUS_NAME_ACTIVATOR + + + An activator connection owns a name as a placeholder for an + implementer, which is started on demand by programs as soon + as the first message arrives. There's some more information + on this topic in + + kdbus.connection + 7 + + . + + + In contrast to + KDBUS_NAME_REPLACE_EXISTING, + when a name is taken over from an activator connection, all + the messages that have been queued in the activator + connection will be moved over to the new owner. The activator + connection will still be tracked for the name and will take + control again if the implementer connection terminates. + + + This flag can not be used when acquiring a name, but is + implicitly set through KDBUS_CMD_HELLO + with KDBUS_HELLO_ACTIVATOR set in + kdbus_cmd_hello.conn_flags. + + + + + + KDBUS_FLAG_NEGOTIATE + + + Requests a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will return + 0, and the flags + field will have all bits set that are valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + + The returned buffer must be freed with the + KDBUS_CMD_FREE ioctl when the user is finished with + it. See + + kdbus.pool + 7 + + for more information. + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_NAME_ACQUIRE</constant> may fail with the following + errors + + + + + EINVAL + + Illegal command flags, illegal name provided, or an activator + tried to acquire a second name. + + + + + EPERM + + Policy prohibited name ownership. + + + + + EALREADY + + Connection already owns that name. + + + + + EEXIST + + The name already exists and can not be taken over. + + + + + E2BIG + + The maximum number of well-known names per connection is exhausted. + + + + + + + + <constant>KDBUS_CMD_NAME_RELEASE</constant> + may fail with the following errors + + + + + EINVAL + + Invalid command flags, or invalid name provided. + + + + + ESRCH + + Name is not found in the registry. + + + + + EADDRINUSE + + Name is owned by a different connection and can't be released. + + + + + + + + <constant>KDBUS_CMD_LIST</constant> may fail with the following + errors + + + + + EINVAL + + Invalid command flags + + + + + ENOBUFS + + No available memory in the connection's pool. 
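+
+    Putting the pieces of this page together, the following hedged
+    sketch acquires a well-known name on an existing connection. It
+    assumes the UAPI header is available as <linux/kdbus.h> and that
+    the item's string payload is accessible through a flexible str
+    member; error handling is omitted.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/kdbus.h>
+
+#define CMD_ALIGN8(sz) (((sz) + 7ULL) & ~7ULL)
+
+static int acquire_name(int conn_fd, const char *name)
+{
+        uint64_t str_len = strlen(name) + 1;
+        uint64_t item_size = offsetof(struct kdbus_item, str) + str_len;
+        uint64_t cmd_size = sizeof(struct kdbus_cmd) +
+                            CMD_ALIGN8(item_size);
+        struct kdbus_cmd *cmd = calloc(1, cmd_size);
+        int r;
+
+        cmd->size = cmd_size;
+        cmd->flags = KDBUS_NAME_QUEUE;  /* wait in line if name is taken */
+        cmd->items[0].size = item_size;
+        cmd->items[0].type = KDBUS_ITEM_NAME;
+        memcpy(cmd->items[0].str, name, str_len);
+
+        r = ioctl(conn_fd, KDBUS_CMD_NAME_ACQUIRE, cmd);
+        free(cmd);
+        return r;
+}
+
+    If the call succeeds while another connection still owns the
+    name, KDBUS_NAME_IN_QUEUE is reported in return_flags, as
+    described above.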
+
+See Also
+
+kdbus 7
+kdbus.bus 7
+kdbus.connection 7
+kdbus.item 7
+kdbus.policy 7
+kdbus.pool 7
+
diff -Nur linux-4.2/Documentation/kdbus/kdbus.policy.xml linux-4.2-pck/Documentation/kdbus/kdbus.policy.xml
--- linux-4.2/Documentation/kdbus/kdbus.policy.xml 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/Documentation/kdbus/kdbus.policy.xml 2015-09-08 00:48:22.211698004 -0300
@@ -0,0 +1,406 @@
+
+kdbus.policy
+kdbus.policy
+
+kdbus.policy
+7
+
+kdbus.policy
+kdbus policy
+
+Description
+
+A kdbus policy restricts the possibilities of connections to own,
+see and talk to well-known names. A policy can be associated with a
+bus (through a policy holder connection) or a custom endpoint. kdbus
+stores its policy information in a database that can be accessed
+through the following ioctl commands:
+
+KDBUS_CMD_HELLO
+    When creating, or updating, a policy holder connection. See
+    kdbus.connection 7 .
+
+KDBUS_CMD_ENDPOINT_MAKE
+KDBUS_CMD_ENDPOINT_UPDATE
+    When creating, or updating, a bus custom endpoint. See
+    kdbus.endpoint 7 .
+
+In all cases, the name and policy access information is stored in
+items of type KDBUS_ITEM_NAME and KDBUS_ITEM_POLICY_ACCESS. For this
+transport, the following rules apply.
+
+  * An item of type KDBUS_ITEM_NAME must be followed by at least one
+    KDBUS_ITEM_POLICY_ACCESS item.
+
+  * An item of type KDBUS_ITEM_NAME can be followed by an arbitrary
+    number of KDBUS_ITEM_POLICY_ACCESS items.
+
+  * An arbitrary number of groups of names and access levels can be
+    given.
+
+Names passed in items of type KDBUS_ITEM_NAME must comply with the
+rules for valid kdbus names. See kdbus.name 7 for more information.
+
+The payload of an item of type KDBUS_ITEM_POLICY_ACCESS is defined
+by the following struct. For more information on the layout of
+items, please refer to kdbus.item 7 .
+
+struct kdbus_policy_access {
+  __u64 type;
+  __u64 access;
+  __u64 id;
+};
+
+The fields in this struct are described below.
+
+type
+    One of the following.
+
+    KDBUS_POLICY_ACCESS_USER
+        Grant access to the user with the UID stored in the id
+        field.
+
+    KDBUS_POLICY_ACCESS_GROUP
+        Grant access to the group with the GID stored in the id
+        field.
+
+    KDBUS_POLICY_ACCESS_WORLD
+        Grant access to everyone. The id field is ignored.
+
+access
+    The access to grant. One of the following.
+
+    KDBUS_POLICY_SEE
+        Allow the name to be seen.
+
+    KDBUS_POLICY_TALK
+        Allow the name to be talked to.
+
+    KDBUS_POLICY_OWN
+        Allow the name to be owned.
+
+id
+    For KDBUS_POLICY_ACCESS_USER, stores the UID. For
+    KDBUS_POLICY_ACCESS_GROUP, stores the GID.
+
+All endpoints of buses have an empty policy database by default.
+Therefore, unless policy rules are added, all operations will also
+be denied by default. Also see kdbus.endpoint 7 .
+
+Wildcard names
+
+Policy holder connections may upload names that contain the wildcard
+suffix (".*"). Such a policy entry is effective for every well-known
+name that extends the provided name by exactly one more level.
+
+For example, the name foo.bar.* matches both "foo.bar.baz" and
+"foo.bar.bazbaz", but not "foo.bar.baz.baz".
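+
+    The "exactly one more level" rule can be made concrete with a
+    small hedged sketch (a hypothetical helper, not kernel code):
+
+#include <stdbool.h>
+#include <string.h>
+
+static bool wildcard_covers(const char *policy, const char *name)
+{
+        size_t plen = strlen(policy);
+
+        /* the policy entry must end in ".*" */
+        if (plen < 2 || strcmp(policy + plen - 2, ".*") != 0)
+                return false;
+        /* the prefix, including the final dot, must match */
+        if (strncmp(name, policy, plen - 1) != 0)
+                return false;
+        /* the remainder must not contain another level */
+        return strchr(name + plen - 1, '.') == NULL;
+}
+
+    With policy "foo.bar.*", this returns true for "foo.bar.baz" and
+    "foo.bar.bazbaz", and false for "foo.bar.baz.baz".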
+ + This allows connections to take control over multiple names that the + policy holder doesn't need to know about when uploading the policy. + + Such wildcard entries are not allowed for custom endpoints. + + + + + Privileged connections + + The policy database is overruled when action is taken by a privileged + connection. Please refer to + + kdbus.connection + 7 + + for more information on what makes a connection privileged. + + + + + Examples + + For instance, a set of policy rules may look like this: + + + +KDBUS_ITEM_NAME: str='org.foo.bar' +KDBUS_ITEM_POLICY_ACCESS: type=USER, access=OWN, ID=1000 +KDBUS_ITEM_POLICY_ACCESS: type=USER, access=TALK, ID=1001 +KDBUS_ITEM_POLICY_ACCESS: type=WORLD, access=SEE + +KDBUS_ITEM_NAME: str='org.blah.baz' +KDBUS_ITEM_POLICY_ACCESS: type=USER, access=OWN, ID=0 +KDBUS_ITEM_POLICY_ACCESS: type=WORLD, access=TALK + + + + That means that 'org.foo.bar' may only be owned by UID 1000, but every + user on the bus is allowed to see the name. However, only UID 1001 may + actually send a message to the connection and receive a reply from it. + + The second rule allows 'org.blah.baz' to be owned by UID 0 only, but + every user may talk to it. + + + + + TALK access and multiple well-known names per connection + + Note that TALK access is checked against all names of a connection. For + example, if a connection owns both 'org.foo.bar' and + 'org.blah.baz', and the policy database allows + 'org.blah.baz' to be talked to by WORLD, then this + permission is also granted to 'org.foo.bar'. That + might sound illogical, but after all, we allow messages to be directed to + either the ID or a well-known name, and policy is applied to the + connection, not the name. In other words, the effective TALK policy for a + connection is the most permissive of all names the connection owns. + + For broadcast messages, the receiver needs TALK permissions to the sender + to receive the broadcast. + + + Both the endpoint and the bus policy databases are consulted to allow + name registry listing, owning a well-known name and message delivery. + If either one fails, the operation is failed with + errno set to EPERM. + + For best practices, connections that own names with a restricted TALK + access should not install matches. This avoids cases where the sent + message may pass the bloom filter due to false-positives and may also + satisfy the policy rules. + + Also see + + kdbus.match + 7 + . + + + + + Implicit policies + + Depending on the type of the endpoint, a set of implicit rules that + override installed policies might be enforced. + + On default endpoints, the following set is enforced and checked before + any user-supplied policy is checked. + + + + + + Privileged connections always override any installed policy. Those + connections could easily install their own policies, so there is no + reason to enforce installed policies. + + + + + Connections can always talk to connections of the same user. This + includes broadcast messages. + + + + + + Custom endpoints have stricter policies. The following rules apply: + + + + + + Policy rules are always enforced, even if the connection is a + privileged connection. + + + + + Policy rules are always enforced for TALK access, + even if both ends are running under the same user. This includes + broadcast messages. + + + + + To restrict the set of names that can be seen, endpoint policies can + install SEE policies. 
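+
+    As an illustration of how such rules are submitted, the
+    following hedged sketch appends one KDBUS_ITEM_POLICY_ACCESS
+    item to an item sequence, e.g. for a KDBUS_CMD_ENDPOINT_UPDATE
+    call. It assumes <linux/kdbus.h> and that the item exposes its
+    payload as a policy_access member; the caller provides a
+    sufficiently large, zeroed command buffer.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <linux/kdbus.h>
+
+#define POL_ALIGN8(sz) (((sz) + 7ULL) & ~7ULL)
+
+static struct kdbus_item *append_policy_access(struct kdbus_item *item,
+                                               uint64_t type,
+                                               uint64_t access,
+                                               uint64_t id)
+{
+        item->size = offsetof(struct kdbus_item, policy_access) +
+                     sizeof(struct kdbus_policy_access);
+        item->type = KDBUS_ITEM_POLICY_ACCESS;
+        item->policy_access.type = type;     /* _USER, _GROUP, _WORLD */
+        item->policy_access.access = access; /* _SEE, _TALK or _OWN */
+        item->policy_access.id = id;         /* UID/GID; ignored for WORLD */
+
+        /* return the position of the next item, 8-byte aligned */
+        return (struct kdbus_item *)
+                ((uint8_t *)item + POL_ALIGN8(item->size));
+}
+
+    The first rule of the example above would then be expressed as
+    append_policy_access(item, KDBUS_POLICY_ACCESS_USER,
+    KDBUS_POLICY_OWN, 1000), preceded by a KDBUS_ITEM_NAME item
+    carrying "org.foo.bar".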
+
+See Also
+
+kdbus 7
+kdbus.bus 7
+kdbus.endpoint 7
+kdbus.fs 7
+kdbus.item 7
+kdbus.message 7
+kdbus.name 7
+kdbus.pool 7
+
diff -Nur linux-4.2/Documentation/kdbus/kdbus.pool.xml linux-4.2-pck/Documentation/kdbus/kdbus.pool.xml
--- linux-4.2/Documentation/kdbus/kdbus.pool.xml 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/Documentation/kdbus/kdbus.pool.xml 2015-09-08 00:48:22.211698004 -0300
@@ -0,0 +1,326 @@
+
+kdbus.pool
+kdbus.pool
+
+kdbus.pool
+7
+
+kdbus.pool
+kdbus pool
+
+Description
+
+A pool for data received from the kernel is installed for every
+connection of the bus, and is sized according to the information
+stored in the pool_size member of struct kdbus_cmd_hello when
+KDBUS_CMD_HELLO is employed. Internally, the pool is segmented into
+slices, each referenced by its offset in the pool, expressed in
+bytes. See kdbus.connection 7 for more information about
+KDBUS_CMD_HELLO.
+
+The pool is written to by the kernel when one of the following
+ioctls is issued:
+
+KDBUS_CMD_HELLO
+    ... to receive details about the bus the connection was made to
+
+KDBUS_CMD_RECV
+    ... to receive a message
+
+KDBUS_CMD_LIST
+    ... to dump the name registry
+
+KDBUS_CMD_CONN_INFO
+    ... to retrieve information on a connection
+
+KDBUS_CMD_BUS_CREATOR_INFO
+    ... to retrieve information about a connection's bus creator
+
+The offset fields returned by either one of the aforementioned
+ioctls describe offsets inside the pool. In order to make the slice
+available for subsequent calls, KDBUS_CMD_FREE has to be called on
+that offset (see below). Otherwise, the pool will fill up, and the
+connection won't be able to receive any more information through
+its pool.
+
+Pool slice allocation
+
+Pool slices are allocated by the kernel in order to report
+information back to a task, such as messages, returned name lists,
+etc. Allocation of pool slices cannot be initiated by userspace. See
+kdbus.connection 7 and kdbus.name 7 for examples of commands that
+use the pool to return data.
+
+Accessing the pool memory
+
+Memory in the pool is read-only for userspace and may only be
+written to by the kernel. To read from the pool memory, the caller
+is expected to mmap 2 the buffer into its task, like this:
+
+uint8_t *buf = mmap(NULL, size, PROT_READ, MAP_SHARED, conn_fd, 0);
+
+In order to map the entire pool, the size parameter in the example
+above should be set to the value of the pool_size member of
+struct kdbus_cmd_hello when KDBUS_CMD_HELLO was employed to create
+the connection (see above).
+
+The file descriptor used to map the memory must be the one that was
+used to create the connection. In other words, the one that was
+used to call KDBUS_CMD_HELLO. See kdbus.connection 7 for more
+details.
+
+Alternatively, instead of mapping the entire pool buffer, only parts
+of it can be mapped. Every kdbus command that returns an offset (see
+above) also reports a size along with it, so programs can be written
+in a way that they only map portions of the pool to access a
+specific slice.
+
+When access to the pool memory is no longer needed, programs should
+call munmap() on the pointer returned by mmap().
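+
+    As an illustration of mapping only a portion of the pool, the
+    following hedged sketch (a hypothetical helper using plain POSIX
+    calls) maps a single slice. mmap 2 requires a page-aligned file
+    offset, so the kdbus offset is rounded down to a page boundary
+    and the difference is added back to the returned pointer.
+
+#include <stdint.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+static const void *map_slice(int conn_fd, uint64_t offset,
+                             uint64_t size, void **map_base,
+                             size_t *map_len)
+{
+        long page = sysconf(_SC_PAGESIZE);
+        uint64_t aligned = offset & ~((uint64_t)page - 1);
+
+        /* map from the page boundary up to the end of the slice */
+        *map_len = (size_t)(offset - aligned + size);
+        *map_base = mmap(NULL, *map_len, PROT_READ, MAP_SHARED,
+                         conn_fd, (off_t)aligned);
+        if (*map_base == MAP_FAILED)
+                return NULL;
+
+        return (const uint8_t *)*map_base + (offset - aligned);
+}
+
+    The mapping is later released with munmap(*map_base, *map_len);
+    the slice itself must still be handed back to the kernel with
+    KDBUS_CMD_FREE, as described in the next section.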
+Freeing pool slices
+
+The KDBUS_CMD_FREE ioctl is used to free a slice inside the pool,
+describing an offset that was returned in an offset field of
+another ioctl struct. The KDBUS_CMD_FREE command takes a
+struct kdbus_cmd_free as argument.
+
+struct kdbus_cmd_free {
+  __u64 size;
+  __u64 flags;
+  __u64 return_flags;
+  __u64 offset;
+  struct kdbus_item items[0];
+};
+
+The fields in this struct are described below.
+
+size
+    The overall size of the struct, including its items.
+
+flags
+    Currently unused. KDBUS_FLAG_NEGOTIATE is accepted to probe for
+    valid flags. If set, the ioctl will return 0, and the flags
+    field is set to 0.
+
+return_flags
+    Flags returned by the kernel. Currently unused and always set to
+    0 by the kernel.
+
+offset
+    The offset to free, as returned by other ioctls that allocated
+    memory for returned information.
+
+items
+    Items to specify further details for the free command. Currently
+    unused. Unrecognized items are rejected, and the ioctl will fail
+    with errno set to EINVAL. All items except for
+    KDBUS_ITEM_NEGOTIATE (see kdbus.item 7 ) will be rejected.
+
+Return value
+
+On success, all mentioned ioctl commands return 0; on error, -1 is
+returned, and errno is set to indicate the error. If the issued
+ioctl is illegal for the file descriptor used, errno will be set to
+ENOTTY.
+
+KDBUS_CMD_FREE may fail with the following errors
+
+ENXIO
+    No pool slice found at given offset.
+
+EINVAL
+    Invalid flags provided.
+
+EINVAL
+    The offset is valid, but the user is not allowed to free the
+    slice. This happens, for example, if the offset was retrieved
+    with KDBUS_RECV_PEEK.
+
+See Also
+
+kdbus 7
+kdbus.bus 7
+kdbus.connection 7
+kdbus.endpoint 7
+kdbus.name 7
+mmap 2
+munmap 2
+
diff -Nur linux-4.2/Documentation/kdbus/kdbus.xml linux-4.2-pck/Documentation/kdbus/kdbus.xml
--- linux-4.2/Documentation/kdbus/kdbus.xml 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/Documentation/kdbus/kdbus.xml 2015-09-08 00:48:22.211698004 -0300
@@ -0,0 +1,1012 @@
+
+kdbus
+kdbus
+
+kdbus
+7
+
+kdbus
+Kernel Message Bus
+
+Synopsis
+
+kdbus is an inter-process communication bus system controlled by the
+kernel. It provides user-space with an API to create buses and send
+unicast and multicast messages to one, or many, peers connected to
+the same bus. It does not enforce any layout on the transmitted
+data, but only provides the transport layer used for message
+interchange between peers.
+
+This set of man-pages gives a comprehensive overview of the
+kernel-level API, with all ioctl commands, associated structs and
+bit masks. However, most people will not use this API level
+directly, but rather let one of the high-level abstraction libraries
+help them integrate D-Bus functionality into their applications.
+
+Description
+
+kdbus provides a pseudo filesystem called kdbusfs, which is usually
+mounted on /sys/fs/kdbus. Bus primitives can be accessed as files
+and sub-directories underneath this mount-point. Any advanced
+operations are done via ioctl() on files created by kdbusfs.
+Multiple mount-points of kdbusfs are independent of each other. This
+allows namespacing of kdbus by mounting a new instance of kdbusfs in
+a new mount-namespace.
kdbus calls these + mount instances domains and each bus belongs to exactly one domain. + + + + kdbus was designed as a transport layer for D-Bus, but is in no way + limited, nor controlled by the D-Bus protocol specification. The D-Bus + protocol is one possible application layer on top of kdbus. + + + + For the general D-Bus protocol specification, its payload format, its + marshaling, and its communication semantics, please refer to the + + D-Bus specification. + + + + + + Terminology + + + Domain + + A domain is a kdbusfs mount-point containing all + the bus primitives. Each domain is independent, and separate domains + do not affect each other. + + + + + Bus + + A bus is a named object inside a domain. Clients exchange messages + over a bus. Multiple buses themselves have no connection to each other; + messages can only be exchanged on the same bus. The default endpoint of + a bus, to which clients establish connections, is the "bus" file + /sys/fs/kdbus/<bus name>/bus. + Common operating system setups create one "system bus" per system, + and one "user bus" for every logged-in user. Applications or services + may create their own private buses. The kernel driver does not + distinguish between different bus types, they are all handled the same + way. See + + kdbus.bus + 7 + + for more details. + + + + + Endpoint + + An endpoint provides a file to talk to a bus. Opening an endpoint + creates a new connection to the bus to which the endpoint belongs. All + endpoints have unique names and are accessible as files underneath the + directory of a bus, e.g., /sys/fs/kdbus/<bus>/<endpoint> + Every bus has a default endpoint called "bus". + A bus can optionally offer additional endpoints with custom names + to provide restricted access to the bus. Custom endpoints carry + additional policy which can be used to create sandboxes with + locked-down, limited, filtered access to a bus. See + + kdbus.endpoint + 7 + + for more details. + + + + + Connection + + A connection to a bus is created by opening an endpoint file of a + bus. Every ordinary client connection has a unique identifier on the + bus and can address messages to every other connection on the same + bus by using the peer's connection ID as the destination. See + + kdbus.connection + 7 + + for more details. + + + + + Pool + + Each connection allocates a piece of shmem-backed memory that is + used to receive messages and answers to ioctl commands from the kernel. + It is never used to send anything to the kernel. In order to access that + memory, an application must mmap() it into its address space. See + + kdbus.pool + 7 + + for more details. + + + + + Well-known Name + + A connection can, in addition to its implicit unique connection ID, + request the ownership of a textual well-known name. Well-known names are + noted in reverse-domain notation, such as com.example.service1. A + connection that offers a service on a bus is usually reached by its + well-known name. An analogy of connection ID and well-known name is an + IP address and a DNS name associated with that address. See + + kdbus.name + 7 + + for more details. + + + + + Message + + Connections can exchange messages with other connections by addressing + the peers with their connection ID or well-known name. A message + consists of a message header with information on how to route the + message, and the message payload, which is a logical byte stream of + arbitrary size. 
Messages can carry additional file descriptors to be + passed from one connection to another, just like passing file + descriptors over UNIX domain sockets. Every connection can specify which + set of metadata the kernel should attach to the message when it is + delivered to the receiving connection. Metadata contains information + like: system time stamps, UID, GID, TID, proc-starttime, well-known + names, process comm, process exe, process argv, cgroup, capabilities, + seclabel, audit session, loginuid and the connection's human-readable + name. See + + kdbus.message + 7 + + for more details. + + + + + Item + + The API of kdbus implements the notion of items, submitted through and + returned by most ioctls, and stored inside data structures in the + connection's pool. See + + kdbus.item + 7 + + for more details. + + + + + Broadcast, signal, filter, match + + Signals are messages that a receiver opts in for by installing a blob of + bytes, called a 'match'. Signal messages must always carry a + counter-part blob, called a 'filter', and signals are only delivered to + peers which have a match that white-lists the message's filter. Senders + of signal messages can use either a single connection ID as receiver, + or the special connection ID + KDBUS_DST_ID_BROADCAST to potentially send it to + all connections of a bus, following the logic described above. See + + kdbus.match + 7 + + and + + kdbus.message + 7 + + for more details. + + + + + Policy + + A policy is a set of rules that define which connections can see, talk + to, or register a well-known name on the bus. A policy is attached to + buses and custom endpoints, and modified by policy holder connections or + owners of custom endpoints. See + + kdbus.policy + 7 + + for more details. + + + + + Privileged bus users + + A user connecting to the bus is considered privileged if it is either + the creator of the bus, or if it has the CAP_IPC_OWNER capability flag + set. See + + kdbus.connection + 7 + + for more details. + + + + + + Bus Layout + + + A bus provides and defines an environment that peers + can connect to for message interchange. A bus is created via the kdbus + control interface and can be modified by the bus creator. It applies the + policy that control all bus operations. The bus creator itself does not + participate as a peer. To establish a peer + connection, you have to open one of the + endpoints of a bus. Each bus provides a default + endpoint, but further endpoints can be created on-demand. Endpoints are + used to apply additional policies for all connections on this endpoint. + Thus, they provide additional filters to further restrict access of + specific connections to the bus. + + + + Following, you can see an example bus layout: + + + + + + + + Data structures and interconnections + + + + + Metadata + + + When metadata is collected + + kdbus records data about the system in certain situations. Such metadata + can refer to the currently active process (creds, PIDs, current user + groups, process names and its executable path, cgroup membership, + capabilities, security label and audit information), connection + information (description string, currently owned names) and time stamps. + + + Metadata is collected at the following times. + + + + + When a bus is created (KDBUS_CMD_MAKE), + information about the calling task is collected. This data is returned + by the kernel via the KDBUS_CMD_BUS_CREATOR_INFO + call. 
+ + + + + When a connection is created (KDBUS_CMD_HELLO), + information about the calling task is collected. Alternatively, a + privileged connection may provide 'faked' information about + credentials, PIDs and security labels which will be stored instead. + This data is returned by the kernel as information on a connection + (KDBUS_CMD_CONN_INFO). Only metadata that a + connection allowed to be sent (by setting its bit in + attach_flags_send) will be exported in this way. + + + + + + When a message is sent (KDBUS_CMD_SEND), + information about the sending task and the sending connection is + collected. This metadata will be attached to the message when it + arrives in the receiver's pool. If the connection sending the + message installed faked credentials (see + + kdbus.connection + 7 + ), + the message will not be augmented by any information about the + currently sending task. Note that only metadata that was requested + by the receiving connection will be collected and attached to + messages. + + + + + + Which metadata items are actually delivered depends on the following + sets and masks: + + + + + (a) the system-wide kmod creds mask + (module parameter attach_flags_mask) + + + + (b) the per-connection send creds mask, set by the connecting client + + + + (c) the per-connection receive creds mask, set by the connecting + client + + + + (d) the per-bus minimal creds mask, set by the bus creator + + + + (e) the per-bus owner creds mask, set by the bus creator + + + + (f) the mask specified when querying creds of a bus peer + + + + (g) the mask specified when querying creds of a bus owner + + + + + With the following rules: + + + + + + [1] The creds attached to messages are determined as + a & b & c. + + + + + + [2] When connecting to a bus (KDBUS_CMD_HELLO), + and ~b & d != 0, the call will fail with, + -1, and errno is set to + ECONNREFUSED. + + + + + + [3] When querying creds of a bus peer, the creds returned are + a & b & f. + + + + + + [4] When querying creds of a bus owner, the creds returned are + a & e & g. + + + + + + Hence, programs might not always get all requested metadata items that + it requested. Code must be written so that it can cope with this fact. + + + + + Benefits and heads-up + + Attaching metadata to messages has two major benefits. + + + + + Metadata attached to messages is gathered at the moment when the + other side calls KDBUS_CMD_SEND, or, + respectively, then the kernel notification is generated. There is + no need for the receiving peer to retrieve information about the + task in a second step. This closes a race gap that would otherwise + be inherent. + + + + + As metadata is delivered along with messages in the same data + blob, no extra calls to kernel functions etc. are needed to gather + them. + + + + + Note, however, that collecting metadata does come at a price for + performance, so developers should carefully assess which metadata to + really opt-in for. For best practice, data that is not needed as part + of a message should not be requested by the connection in the first + place (see attach_flags_recv in + KDBUS_CMD_HELLO). + + + + + Attach flags for metadata items + + To let the kernel know which metadata information to attach as items + to the aforementioned commands, it uses a bitmask. In those, the + following attach flags are currently supported. 
+ Both the attach_flags_recv and + attach_flags_send fields of + struct kdbus_cmd_hello, as well as the payload of the + KDBUS_ITEM_ATTACH_FLAGS_SEND and + KDBUS_ITEM_ATTACH_FLAGS_RECV items follow this + scheme. + + + + + KDBUS_ATTACH_TIMESTAMP + + Requests the attachment of an item of type + KDBUS_ITEM_TIMESTAMP. + + + + + KDBUS_ATTACH_CREDS + + Requests the attachment of an item of type + KDBUS_ITEM_CREDS. + + + + + KDBUS_ATTACH_PIDS + + Requests the attachment of an item of type + KDBUS_ITEM_PIDS. + + + + + KDBUS_ATTACH_AUXGROUPS + + Requests the attachment of an item of type + KDBUS_ITEM_AUXGROUPS. + + + + + KDBUS_ATTACH_NAMES + + Requests the attachment of an item of type + KDBUS_ITEM_OWNED_NAME. + + + + + KDBUS_ATTACH_TID_COMM + + Requests the attachment of an item of type + KDBUS_ITEM_TID_COMM. + + + + + KDBUS_ATTACH_PID_COMM + + Requests the attachment of an item of type + KDBUS_ITEM_PID_COMM. + + + + + KDBUS_ATTACH_EXE + + Requests the attachment of an item of type + KDBUS_ITEM_EXE. + + + + + KDBUS_ATTACH_CMDLINE + + Requests the attachment of an item of type + KDBUS_ITEM_CMDLINE. + + + + + KDBUS_ATTACH_CGROUP + + Requests the attachment of an item of type + KDBUS_ITEM_CGROUP. + + + + + KDBUS_ATTACH_CAPS + + Requests the attachment of an item of type + KDBUS_ITEM_CAPS. + + + + + KDBUS_ATTACH_SECLABEL + + Requests the attachment of an item of type + KDBUS_ITEM_SECLABEL. + + + + + KDBUS_ATTACH_AUDIT + + Requests the attachment of an item of type + KDBUS_ITEM_AUDIT. + + + + + KDBUS_ATTACH_CONN_DESCRIPTION + + Requests the attachment of an item of type + KDBUS_ITEM_CONN_DESCRIPTION. + + + + + + Please refer to + + kdbus.item + 7 + + for detailed information about the layout and payload of items and + what metadata should be used to. + + + + + + The ioctl interface + + + As stated in the 'synopsis' section above, application developers are + strongly encouraged to use kdbus through one of the high-level D-Bus + abstraction libraries, rather than using the low-level API directly. + + + + kdbus on the kernel level exposes its functions exclusively through + + ioctl + 2 + , + employed on file descriptors returned by + + open + 2 + + on pseudo files exposed by + + kdbus.fs + 7 + . + + + Following is a list of all the ioctls, along with the command structs + they must be used with. + + + + + + + ioctl signature + command + transported struct + + + + + 0x40189500 + KDBUS_CMD_BUS_MAKE + struct kdbus_cmd * + + 0x40189510 + KDBUS_CMD_ENDPOINT_MAKE + struct kdbus_cmd * + + 0xc0609580 + KDBUS_CMD_HELLO + struct kdbus_cmd_hello * + + 0x40189582 + KDBUS_CMD_BYEBYE + struct kdbus_cmd * + + 0x40389590 + KDBUS_CMD_SEND + struct kdbus_cmd_send * + + 0x80409591 + KDBUS_CMD_RECV + struct kdbus_cmd_recv * + + 0x40209583 + KDBUS_CMD_FREE + struct kdbus_cmd_free * + + 0x401895a0 + KDBUS_CMD_NAME_ACQUIRE + struct kdbus_cmd * + + 0x401895a1 + KDBUS_CMD_NAME_RELEASE + struct kdbus_cmd * + + 0x80289586 + KDBUS_CMD_LIST + struct kdbus_cmd_list * + + 0x80309584 + KDBUS_CMD_CONN_INFO + struct kdbus_cmd_info * + + 0x40209551 + KDBUS_CMD_UPDATE + struct kdbus_cmd * + + 0x80309585 + KDBUS_CMD_BUS_CREATOR_INFO + struct kdbus_cmd_info * + + 0x40189511 + KDBUS_CMD_ENDPOINT_UPDATE + struct kdbus_cmd * + + 0x402095b0 + KDBUS_CMD_MATCH_ADD + struct kdbus_cmd_match * + + 0x402095b1 + KDBUS_CMD_MATCH_REMOVE + struct kdbus_cmd_match * + + + + + + + Depending on the type of kdbusfs node that was + opened and what ioctls have been executed on a file descriptor before, + a different sub-set of ioctl commands is allowed. 
+ + + + + + On a file descriptor resulting from opening a + control node, only the + KDBUS_CMD_BUS_MAKE ioctl may be executed. + + + + + On a file descriptor resulting from opening a + bus endpoint node, only the + KDBUS_CMD_ENDPOINT_MAKE and + KDBUS_CMD_HELLO ioctls may be executed. + + + + + A file descriptor that was used to create a bus + (via KDBUS_CMD_BUS_MAKE) is called a + bus owner file descriptor. The bus will be + active as long as the file descriptor is kept open. + A bus owner file descriptor can not be used to + employ any further ioctls. As soon as + + close + 2 + + is called on it, the bus will be shut down, along will all associated + endpoints and connections. See + + kdbus.bus + 7 + + for more details. + + + + + A file descriptor that was used to create an endpoint + (via KDBUS_CMD_ENDPOINT_MAKE) is called an + endpoint owner file descriptor. The endpoint + will be active as long as the file descriptor is kept open. + An endpoint owner file descriptor can only be used + to update details of an endpoint through the + KDBUS_CMD_ENDPOINT_UPDATE ioctl. As soon as + + close + 2 + + is called on it, the endpoint will be removed from the bus, and all + connections that are connected to the bus through it are shut down. + See + + kdbus.endpoint + 7 + + for more details. + + + + + A file descriptor that was used to create a connection + (via KDBUS_CMD_HELLO) is called a + connection owner file descriptor. The connection + will be active as long as the file descriptor is kept open. + A connection owner file descriptor may be used to + issue any of the following ioctls. + + + + + KDBUS_CMD_UPDATE to tweak details of the + connection. See + + kdbus.connection + 7 + . + + + + KDBUS_CMD_BYEBYE to shut down a connection + without losing messages. See + + kdbus.connection + 7 + . + + + + KDBUS_CMD_FREE to free a slice of memory in + the pool. See + + kdbus.pool + 7 + . + + + + KDBUS_CMD_CONN_INFO to retrieve information + on other connections on the bus. See + + kdbus.connection + 7 + . + + + + KDBUS_CMD_BUS_CREATOR_INFO to retrieve + information on the bus creator. See + + kdbus.connection + 7 + . + + + + KDBUS_CMD_LIST to retrieve a list of + currently active well-known names and unique IDs on the bus. See + + kdbus.name + 7 + . + + + + KDBUS_CMD_SEND and + KDBUS_CMD_RECV to send or receive a message. + See + + kdbus.message + 7 + . + + + + KDBUS_CMD_NAME_ACQUIRE and + KDBUS_CMD_NAME_RELEASE to acquire or release + a well-known name on the bus. See + + kdbus.name + 7 + . + + + + KDBUS_CMD_MATCH_ADD and + KDBUS_CMD_MATCH_REMOVE to add or remove + a match for signal messages. See + + kdbus.match + 7 + . + + + + + + + These ioctls, along with the structs they transport, are explained in + detail in the other documents linked to in the "See Also" section below. 
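 As an illustration of these ownership rules, the following sketch frees one pool slice through a connection owner file descriptor. It assumes the uapi header added by this patch (<linux/kdbus.h>) is installed; pool_free_slice() is an illustrative helper name, not part of the API:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/kdbus.h>

/* Release the slice at 'offset'; valid only on a connection owner fd. */
static int pool_free_slice(int conn_fd, __u64 offset)
{
	struct kdbus_cmd_free cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.size = sizeof(cmd);		/* no trailing items */
	cmd.offset = offset;		/* as returned by, e.g., KDBUS_CMD_RECV */

	/* 0 on success; -1 with errno set (ENOTTY on any other fd type) */
	return ioctl(conn_fd, KDBUS_CMD_FREE, &cmd);
}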
+ + + + + See Also + + + + kdbus.bus + 7 + + + + + kdbus.connection + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.item + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + ioctl + 2 + + + + + mmap + 2 + + + + + open + 2 + + + + + close + 2 + + + + D-Bus + + + + + diff -Nur linux-4.2/Documentation/kdbus/stylesheet.xsl linux-4.2-pck/Documentation/kdbus/stylesheet.xsl --- linux-4.2/Documentation/kdbus/stylesheet.xsl 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/stylesheet.xsl 2015-09-08 00:48:22.211698004 -0300 @@ -0,0 +1,16 @@ + + + 1 + ansi + 80 + 0 + A4 + 2 + 1 + 1 + + + diff -Nur linux-4.2/Documentation/scsi/link_power_management_policy.txt linux-4.2-pck/Documentation/scsi/link_power_management_policy.txt --- linux-4.2/Documentation/scsi/link_power_management_policy.txt 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/Documentation/scsi/link_power_management_policy.txt 2015-09-08 00:48:22.211698004 -0300 @@ -1,8 +1,11 @@ This parameter allows the user to set the link (interface) power management. -There are 3 possible options: +There are 4 possible options: Value Effect ---------------------------------------------------------------------------- +firmware_defaults Inherit configuration from the state programmed by + the firmware during system init. + min_power Tell the controller to try to make the link use the least possible power when possible. This may sacrifice some performance due to increased latency diff -Nur linux-4.2/Documentation/tp_smapi.txt linux-4.2-pck/Documentation/tp_smapi.txt --- linux-4.2/Documentation/tp_smapi.txt 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/tp_smapi.txt 2015-09-08 00:48:22.211698004 -0300 @@ -0,0 +1,267 @@ +tp_smapi version 0.40 +IBM ThinkPad hardware functions driver + +Author: Shem Multinymous +Project: http://sourceforge.net/projects/tpctl +Wiki: http://thinkwiki.org/wiki/tp_smapi +List: linux-thinkpad@linux-thinkpad.org + (http://mailman.linux-thinkpad.org/mailman/listinfo/linux-thinkpad) + +Description +----------- + +ThinkPad laptops include a proprietary interface called SMAPI BIOS +(System Management Application Program Interface) which provides some +hardware control functionality that is not accessible by other means. + +This driver exposes some features of the SMAPI BIOS through a sysfs +interface. It is suitable for newer models, on which SMAPI is invoked +through IO port writes. Older models use a different SMAPI interface; +for those, try the "thinkpad" module from the "tpctl" package. + +WARNING: +This driver uses undocumented features and direct hardware access. +It thus cannot be guaranteed to work, and may cause arbitrary damage +(especially on models it wasn't tested on). + + +Module parameters +----------------- + +thinkpad_ec module: + force_io=1 lets thinkpad_ec load on some recent ThinkPad models + (e.g., T400 and T500) whose BIOS's ACPI DSDT reserves the ports we need. +tp_smapi module: + debug=1 enables verbose dmesg output. + + +Usage +----- + +Control of battery charging thresholds (in percents of current full charge +capacity): + +# echo 40 > /sys/devices/platform/smapi/BAT0/start_charge_thresh +# echo 70 > /sys/devices/platform/smapi/BAT0/stop_charge_thresh +# cat /sys/devices/platform/smapi/BAT0/*_charge_thresh + + (This is useful since Li-Ion batteries wear out much faster at very + high or low charge levels. 
The driver also keeps the thresholds
+  across suspend-to-disk with AC disconnected; this isn't done
+  automatically by the hardware.)
+
+Inhibiting battery charging for 17 minutes (overrides thresholds):
+
+# echo 17 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes
+# echo 0  > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes  # stop
+# cat /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes
+
+  (This can be used to control which battery is charged when using an
+  Ultrabay battery.)
+
+Forcing battery discharging even if AC power is available:
+
+# echo 1 > /sys/devices/platform/smapi/BAT0/force_discharge  # start discharge
+# echo 0 > /sys/devices/platform/smapi/BAT0/force_discharge  # stop discharge
+# cat /sys/devices/platform/smapi/BAT0/force_discharge
+
+  (When AC is connected, forced discharging will automatically stop
+  when the battery is fully depleted -- this is useful for calibration.
+  Also, this attribute can be used to control which battery is discharged
+  when both a system battery and an Ultrabay battery are connected.)
+
+Misc read-only battery status attributes (see note about HDAPS below):
+
+/sys/devices/platform/smapi/BAT0/installed            # 0 or 1
+/sys/devices/platform/smapi/BAT0/state                # idle/charging/discharging
+/sys/devices/platform/smapi/BAT0/cycle_count          # integer counter
+/sys/devices/platform/smapi/BAT0/current_now          # instantaneous current
+/sys/devices/platform/smapi/BAT0/current_avg          # last minute average
+/sys/devices/platform/smapi/BAT0/power_now            # instantaneous power
+/sys/devices/platform/smapi/BAT0/power_avg            # last minute average
+/sys/devices/platform/smapi/BAT0/last_full_capacity   # in mWh
+/sys/devices/platform/smapi/BAT0/remaining_percent    # remaining percent of energy (set by calibration)
+/sys/devices/platform/smapi/BAT0/remaining_percent_error     # error range of remaining_percent (not reset by calibration)
+/sys/devices/platform/smapi/BAT0/remaining_running_time      # in minutes, by last minute average power
+/sys/devices/platform/smapi/BAT0/remaining_running_time_now  # in minutes, by instantaneous power
+/sys/devices/platform/smapi/BAT0/remaining_charging_time     # in minutes
+/sys/devices/platform/smapi/BAT0/remaining_capacity   # in mWh
+/sys/devices/platform/smapi/BAT0/design_capacity      # in mWh
+/sys/devices/platform/smapi/BAT0/voltage              # in mV
+/sys/devices/platform/smapi/BAT0/design_voltage       # in mV
+/sys/devices/platform/smapi/BAT0/charging_max_current # max charging current
+/sys/devices/platform/smapi/BAT0/charging_max_voltage # max charging voltage
+/sys/devices/platform/smapi/BAT0/group{0,1,2,3}_voltage      # see below
+/sys/devices/platform/smapi/BAT0/manufacturer         # string
+/sys/devices/platform/smapi/BAT0/model                # string
+/sys/devices/platform/smapi/BAT0/barcoding            # string
+/sys/devices/platform/smapi/BAT0/chemistry            # string
+/sys/devices/platform/smapi/BAT0/serial               # integer
+/sys/devices/platform/smapi/BAT0/manufacture_date     # YYYY-MM-DD
+/sys/devices/platform/smapi/BAT0/first_use_date       # YYYY-MM-DD
+/sys/devices/platform/smapi/BAT0/temperature          # in milli-Celsius
+/sys/devices/platform/smapi/BAT0/dump                 # see below
+/sys/devices/platform/smapi/ac_connected              # 0 or 1
+
+The BAT0/group{0,1,2,3}_voltage attribute refers to the separate cell groups
+in each battery. For example, on the ThinkPad 600, X3x, T4x and R5x models,
+the battery contains 3 cell groups in series, with each group consisting of 2
+or 3 cells connected in parallel. The voltage of each group is given by these
+attributes, and their sum (roughly) equals the "voltage" attribute.
+(The effective performance of the battery is determined by the weakest group,
+i.e., the one whose voltage changes most rapidly during dis/charging.)
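+
+For programs that prefer not to shell out, these attributes can be read like
+any other sysfs file. A minimal sketch for the numeric attributes (the helper
+name read_bat0_attr() is made up for illustration; string attributes such as
+"state" would need a plain read instead):
+
+#include <stdio.h>
+
+/* Read one numeric BAT0 attribute, e.g. remaining_percent; -1 on error. */
+static long read_bat0_attr(const char *name)
+{
+	char path[128];
+	long val = -1;
+	FILE *f;
+
+	snprintf(path, sizeof(path),
+		 "/sys/devices/platform/smapi/BAT0/%s", name);
+	f = fopen(path, "r");
+	if (!f)
+		return -1;
+	if (fscanf(f, "%ld", &val) != 1)
+		val = -1;
+	fclose(f);
+	return val;
+}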
+The "BAT0/dump" attribute gives a hex dump of the raw status data, which
+contains additional data not in the above (if you can figure it out). Some
+unused values are autodetected and replaced by "--".
+
+In all of the above, replace BAT0 with BAT1 to address the 2nd battery (e.g.
+in the UltraBay).
+
+
+Raw SMAPI calls:
+
+/sys/devices/platform/smapi/smapi_request
+This performs raw SMAPI calls. It uses a bad interface that cannot handle
+multiple simultaneous accesses. Don't touch it, it's for development only.
+If you did touch it, you would do something like
+# echo '211a 100 0 0' > /sys/devices/platform/smapi/smapi_request
+# cat /sys/devices/platform/smapi/smapi_request
+and notice that in the output "211a 34b b2 0 0 0 'OK'", the "4b" in the 2nd
+value, converted to decimal, is 75: the current charge stop threshold.
+
+
+Model-specific status
+---------------------
+
+Works (at least partially) on the following ThinkPad models:
+* A30
+* G41
+* R40, R50p, R51, R52
+* T23, T40, T40p, T41, T41p, T42, T42p, T43, T43p, T60
+* X24, X31, X32, X40, X41, X60
+* Z60t, Z61m
+
+Not all functions are available on all models; for detailed status, see:
+  http://thinkwiki.org/wiki/tp_smapi
+
+Please report success/failure by e-mail or on the Wiki.
+If you get a "not implemented" or "not supported" message, your laptop
+probably just can't do that (at least not via the SMAPI BIOS).
+For negative reports, follow the bug reporting guidelines below.
+If you send me the necessary technical data (i.e., SMAPI function
+interfaces), I will support additional models.
+
+
+Additional HDAPS features
+-------------------------
+
+The modified hdaps driver has several improvements over the one in mainline
+(beyond resolving the conflict with thinkpad_ec and tp_smapi):
+
+- Fixes reliability and improves support for recent ThinkPad models
+  (especially *60 and newer). Unlike the mainline driver, the modified hdaps
+  correctly follows the Embedded Controller communication protocol.
+
+- Extends the "invert" parameter to cover all possible axis orientations.
+  The possible values are as follows.
+  Let X,Y denote the hardware readouts.
+  Let R denote the laptop's roll (tilt left/right).
+  Let P denote the laptop's pitch (tilt forward/backward).
+    invert=0:   R= X  P= Y   (same as mainline)
+    invert=1:   R=-X  P=-Y   (same as mainline)
+    invert=2:   R=-X  P= Y   (new)
+    invert=3:   R= X  P=-Y   (new)
+    invert=4:   R= Y  P= X   (new)
+    invert=5:   R=-Y  P=-X   (new)
+    invert=6:   R=-Y  P= X   (new)
+    invert=7:   R= Y  P=-X   (new)
+  It's probably easiest to just try all 8 possibilities and see which yields
+  correct results (e.g., in the hdaps-gl visualisation).
+
+- Adds a whitelist which automatically sets the correct axis orientation for
+  some models. If the value for your model is wrong or missing, you can
+  override it using the "invert" parameter. Please also update the tables at
+  http://www.thinkwiki.org/wiki/tp_smapi and
+  http://www.thinkwiki.org/wiki/List_of_DMI_IDs
+  and submit a patch for the whitelist in hdaps.c.
+
+- Provides new attributes:
+  /sys/devices/platform/hdaps/sampling_rate:
+    This determines the frequency at which the host queries the embedded
+    controller for accelerometer data (and informs the hdaps input devices).
+    Default=50.
+  /sys/devices/platform/hdaps/oversampling_ratio:
+    When set to X, the embedded controller is told to do physical accelerometer
+    measurements at a rate that is X times higher than the rate at which
+    the driver reads those measurements (i.e., X*sampling_rate). This
+    makes the readouts from the embedded controller fresher, and is also
+    useful for the running average filter (see next). Default=5.
+  /sys/devices/platform/hdaps/running_avg_filter_order:
+    When set to X, reported readouts will be the average of the last X physical
+    accelerometer measurements. Current firmware allows 1<=X<=8. Setting it to
+    a high value decreases readout fluctuations. The averaging is handled by
+    the embedded controller, so no CPU resources are used. Higher values make
+    the readouts smoother, since they average out both sensor noise (good) and
+    abrupt changes (bad). Default=2.
+
+- Provides a second input device, which publishes the raw accelerometer
+  measurements (without the fuzzing needed for joystick emulation). This input
+  device can be matched by a udev rule such as the following (all on one line;
+  a short C sketch for reading the device follows this list):
+    KERNEL=="event[0-9]*", ATTRS{phys}=="hdaps/input1",
+    ATTRS{modalias}=="input:b0019v1014p5054e4801-*",
+    SYMLINK+="input/hdaps/accelerometer-event"
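+
+Here is the C sketch referred to above for reading the raw input device. It
+assumes the udev rule shown in the list created the
+/dev/input/hdaps/accelerometer-event symlink:
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/input.h>
+
+/* Dump raw X/Y readouts from the second (raw) hdaps input device. */
+int main(void)
+{
+	struct input_event ev;
+	int fd = open("/dev/input/hdaps/accelerometer-event", O_RDONLY);
+
+	if (fd < 0) {
+		perror("open");
+		return 1;
+	}
+	while (read(fd, &ev, sizeof(ev)) == sizeof(ev)) {
+		if (ev.type == EV_ABS && (ev.code == ABS_X || ev.code == ABS_Y))
+			printf("%c = %d\n",
+			       ev.code == ABS_X ? 'X' : 'Y', ev.value);
+	}
+	close(fd);
+	return 0;
+}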
+A new version of the hdapsd userspace daemon, which uses the input device
+interface instead of polling sysfs, is available separately. Using it reduces
+the total number of interrupts per second generated by hdaps+hdapsd (on
+tickless kernels) to 50, down from a value that fluctuates between 50 and 100.
+Set the sampling_rate sysfs attribute to a lower value to further reduce
+interrupts, at the expense of response latency.
+
+Licensing note: all my changes to the HDAPS driver are licensed under the
+GPL version 2 or, at your option and to the extent allowed by derivation from
+prior works, any later version. My version of hdaps is a derived work from the
+mainline version, which at the time of writing is available only under
+GPL version 2.
+
+Bug reporting
+-------------
+
+Mail . Please include:
+* Details about your model,
+* Relevant "dmesg" output. Make sure thinkpad_ec and tp_smapi are loaded with
+  the "debug=1" parameter (e.g., use "make load HDAPS=1 DEBUG=1").
+* Output of "dmidecode | grep -C5 Product"
+* Does the failed functionality work under Windows?
+
+
+More about SMAPI
+----------------
+
+For hints about what may be possible via the SMAPI BIOS and how, see:
+
+* The IBM Technical Reference Manual for the ThinkPad 770
+  (http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD)
+* Exported symbols in PWRMGRIF.DLL or TPPWRW32.DLL (e.g., use "objdump -x").
+* drivers/char/mwave/smapi.c in the Linux kernel tree.
+* The "thinkpad" SMAPI module (http://tpctl.sourceforge.net).
+* The SMAPI_* constants in tp_smapi.c.
+
+Note that in the above Technical Reference and in the "thinkpad" module,
+SMAPI is invoked through a function call to some physical address. However,
+the interface used by tp_smapi and the above mwave driver, and apparently
+required by newer ThinkPads, is different: you set the parameters up in the
+CPU's registers and write to ports 0xB2 (the APM control port) and 0x4F; this
+triggers an SMI (System Management Interrupt), causing the CPU to enter
+SMM (System Management Mode) and run the BIOS firmware; the results are
+returned in the CPU's registers. It is not clear how the two variants of
+SMAPI are related, though the assignment of error codes seems to be similar.
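+
+As a development aid only (the warning about smapi_request above applies), the
+shell example given earlier can also be driven from C. A minimal sketch, using
+the same '211a 100 0 0' request that reads the charge stop threshold:
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#define SMAPI_REQ "/sys/devices/platform/smapi/smapi_request"
+
+int main(void)
+{
+	char reply[128];
+	ssize_t n;
+	int fd;
+
+	/* write the request, like the 'echo' in the shell example */
+	fd = open(SMAPI_REQ, O_WRONLY);
+	if (fd < 0 || write(fd, "211a 100 0 0", 12) < 0) {
+		perror(SMAPI_REQ);
+		return 1;
+	}
+	close(fd);
+
+	/* read the reply back, like the 'cat' */
+	fd = open(SMAPI_REQ, O_RDONLY);
+	if (fd < 0 || (n = read(fd, reply, sizeof(reply) - 1)) < 0) {
+		perror(SMAPI_REQ);
+		return 1;
+	}
+	reply[n] = '\0';
+	printf("%s\n", reply);	/* e.g. "211a 34b b2 0 0 0 'OK'" */
+	close(fd);
+	return 0;
+}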
+
+In addition, the embedded controller on ThinkPad laptops has a non-standard
+interface at IO ports 0x1600-0x161F (mapped to LCP channel 3 of the H8S chip).
+The interface provides various system management services (currently known:
+battery information and accelerometer readouts). For more information see the
+thinkpad_ec module and the H8S hardware documentation:
+http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf
diff -Nur linux-4.2/Documentation/vm/00-INDEX linux-4.2-pck/Documentation/vm/00-INDEX
--- linux-4.2/Documentation/vm/00-INDEX	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/Documentation/vm/00-INDEX	2015-09-08 00:51:03.460668141 -0300
@@ -16,6 +16,8 @@
 	- explains what hwpoison is
 ksm.txt
 	- how to use the Kernel Samepage Merging feature.
+uksm.txt
+	- Introduction to Ultra KSM
 numa
 	- information about NUMA specific code in the Linux vm.
 numa_memory_policy.txt
diff -Nur linux-4.2/Documentation/vm/uksm.txt linux-4.2-pck/Documentation/vm/uksm.txt
--- linux-4.2/Documentation/vm/uksm.txt	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/Documentation/vm/uksm.txt	2015-09-08 00:51:03.460668141 -0300
@@ -0,0 +1,59 @@
+The Ultra Kernel Samepage Merging feature
+----------------------------------------------
+/*
+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
+ *
+ * This is an improvement upon KSM. Some basic data structures and routines
+ * are borrowed from ksm.c .
+ *
+ * Its new features:
+ * 1. Full system scan:
+ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
+ *      interaction to submit a memory area to KSM is no longer needed.
+ *
+ * 2. Rich area detection:
+ *      It automatically detects rich areas containing abundant duplicated
+ *      pages. Rich areas are given a full scan speed. Poor areas are
+ *      sampled at a reasonable speed with very low CPU consumption.
+ *
+ * 3. Ultra per-page scan speed improvement:
+ *      A new hash algorithm is proposed. As a result, on a machine with a
+ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
+ *      can scan memory areas that do not contain duplicated pages at speeds of
+ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at speeds of
+ *      477MB/sec ~ 923MB/sec.
+ *
+ * 4. Thrashing area avoidance:
+ *      A thrashing area (a VMA with frequent KSM page break-outs) can be
+ *      filtered out. My benchmarks show this is more efficient than KSM's
+ *      per-page hash-value-based volatile page detection.
+ *
+ * 5. Misc changes upon KSM:
+ *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
+ *        comparison. It's much faster than the default C version on x86.
+ *      * rmap_item now has a struct page * member to loosely cache an
+ *        address-->page mapping, which avoids many time-costly calls to
+ *        follow_page().
+ *      * The VMA creation/exit procedures are hooked to keep Ultra KSM
+ *        informed.
+ *      * try_to_merge_two_pages() can now revert a pte if it fails, so no
+ *        break_ksm is needed in this case.
+ *
+ * 6. Full zero page consideration (contributed by Figo Zhang):
+ *      uksmd now considers full zero pages as special pages and merges them
+ *      into a special unswappable uksm zero page.
+ */
+
+ChangeLog:
+
+2012-05-05 Creation of this document.
+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
+2012-05-28 UKSM 0.1.1.2 bug fix release
+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
+2012-07-02 UKSM 0.1.2-beta2
+2012-07-10 UKSM 0.1.2-beta3
+2012-07-26 UKSM 0.1.2 Fine-grained speed control, more scan optimization.
+2012-10-13 UKSM 0.1.2.1 Bug fixes. +2012-12-31 UKSM 0.1.2.2 Minor bug fixes. +2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug". +2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings. diff -Nur linux-4.2/MAINTAINERS linux-4.2-pck/MAINTAINERS --- linux-4.2/MAINTAINERS 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/MAINTAINERS 2015-09-08 00:48:22.215031180 -0300 @@ -5807,6 +5807,19 @@ F: Documentation/kbuild/kconfig-language.txt F: scripts/kconfig/ +KDBUS +M: Greg Kroah-Hartman +M: Daniel Mack +M: David Herrmann +M: Djalal Harouni +L: linux-kernel@vger.kernel.org +S: Maintained +F: ipc/kdbus/* +F: samples/kdbus/* +F: Documentation/kdbus/* +F: include/uapi/linux/kdbus.h +F: tools/testing/selftests/kdbus/ + KDUMP M: Vivek Goyal M: Haren Myneni diff -Nur linux-4.2/arch/x86/Kconfig.cpu linux-4.2-pck/arch/x86/Kconfig.cpu --- linux-4.2/arch/x86/Kconfig.cpu 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/Kconfig.cpu 2015-09-08 00:50:52.507860491 -0300 @@ -137,9 +137,8 @@ -Paxville -Dempsey - config MK6 - bool "K6/K6-II/K6-III" + bool "AMD K6/K6-II/K6-III" depends on X86_32 ---help--- Select this for an AMD K6-family processor. Enables use of @@ -147,7 +146,7 @@ flags to GCC. config MK7 - bool "Athlon/Duron/K7" + bool "AMD Athlon/Duron/K7" depends on X86_32 ---help--- Select this for an AMD Athlon K7-family processor. Enables use of @@ -155,12 +154,62 @@ flags to GCC. config MK8 - bool "Opteron/Athlon64/Hammer/K8" + bool "AMD Opteron/Athlon64/Hammer/K8" ---help--- Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables use of some extended instructions, and passes appropriate optimization flags to GCC. +config MK8SSE3 + bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" + ---help--- + Select this for improved AMD Opteron or Athlon64 Hammer-family processors. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MK10 + bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" + ---help--- + Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, + Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MBARCELONA + bool "AMD Barcelona" + ---help--- + Select this for AMD Barcelona and newer processors. + + Enables -march=barcelona + +config MBOBCAT + bool "AMD Bobcat" + ---help--- + Select this for AMD Bobcat processors. + + Enables -march=btver1 + +config MBULLDOZER + bool "AMD Bulldozer" + ---help--- + Select this for AMD Bulldozer processors. + + Enables -march=bdver1 + +config MPILEDRIVER + bool "AMD Piledriver" + ---help--- + Select this for AMD Piledriver processors. + + Enables -march=bdver2 + +config MJAGUAR + bool "AMD Jaguar" + ---help--- + Select this for AMD Jaguar processors. + + Enables -march=btver2 + config MCRUSOE bool "Crusoe" depends on X86_32 @@ -251,8 +300,17 @@ using the cpu family field in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. +config MATOM + bool "Intel Atom" + ---help--- + + Select this for the Intel Atom platform. Intel Atom CPUs have an + in-order pipelining architecture and thus can benefit from + accordingly optimized code. Use a recent GCC with specific Atom + support in order to fully benefit from selecting this option. 
+ config MCORE2 - bool "Core 2/newer Xeon" + bool "Intel Core 2" ---help--- Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and @@ -260,14 +318,63 @@ family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) -config MATOM - bool "Intel Atom" + Enables -march=core2 + +config MNEHALEM + bool "Intel Nehalem" ---help--- - Select this for the Intel Atom platform. Intel Atom CPUs have an - in-order pipelining architecture and thus can benefit from - accordingly optimized code. Use a recent GCC with specific Atom - support in order to fully benefit from selecting this option. + Select this for 1st Gen Core processors in the Nehalem family. + + Enables -march=nehalem + +config MWESTMERE + bool "Intel Westmere" + ---help--- + + Select this for the Intel Westmere formerly Nehalem-C family. + + Enables -march=westmere + +config MSILVERMONT + bool "Intel Silvermont" + ---help--- + + Select this for the Intel Silvermont platform. + + Enables -march=silvermont + +config MSANDYBRIDGE + bool "Intel Sandy Bridge" + ---help--- + + Select this for 2nd Gen Core processors in the Sandy Bridge family. + + Enables -march=sandybridge + +config MIVYBRIDGE + bool "Intel Ivy Bridge" + ---help--- + + Select this for 3rd Gen Core processors in the Ivy Bridge family. + + Enables -march=ivybridge + +config MHASWELL + bool "Intel Haswell" + ---help--- + + Select this for 4th Gen Core processors in the Haswell family. + + Enables -march=haswell + +config MBROADWELL + bool "Intel Broadwell" + ---help--- + + Select this for 5th Gen Core processors in the Broadwell family. + + Enables -march=broadwell config GENERIC_CPU bool "Generic-x86-64" @@ -276,6 +383,19 @@ Generic x86-64 CPU. Run equally well on all x86-64 CPUs. +config MNATIVE + bool "Native optimizations autodetected by GCC" + ---help--- + + GCC 4.2 and above support -march=native, which automatically detects + the optimum settings to use based on your processor. -march=native + also detects and applies additional settings beyond -march specific + to your CPU, (eg. -msse4). Unless you have a specific reason not to + (e.g. distcc cross-compiling), you should probably be using + -march=native rather than anything listed below. + + Enables -march=native + endchoice config X86_GENERIC @@ -290,6 +410,16 @@ This is really intended for distributors who need more generic optimizations. +config X86_MARCH_NATIVE + bool "Use -march=native cflag (EXPERIMENTAL)" + help + Setting Y here, will result in passing the -march=native and + -mtune=native cflags to GCC while compiling the kernel, which + makes GCC check the CPU capabilities and use the best cflags + for your computer. + + Set Y here only if you use >=gcc-4.2.0. 
+ # # Define implied options from the CPU selection here config X86_INTERNODE_CACHE_SHIFT @@ -300,7 +430,7 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU default "4" if MELAN || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX @@ -331,11 +461,11 @@ config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MNATIVE config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MATOM || MNATIVE config X86_USE_3DNOW def_bool y @@ -359,17 +489,17 @@ config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MNATIVE || MATOM) || X86_64 config X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM + depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE # this should be set for all -march=.. options where the compiler # generates cmov. 
config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) config X86_MINIMUM_CPU_FAMILY int diff -Nur linux-4.2/arch/x86/Makefile linux-4.2-pck/arch/x86/Makefile --- linux-4.2/arch/x86/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/Makefile 2015-09-08 00:50:52.507860491 -0300 @@ -94,13 +94,35 @@ KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) + cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8) + cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10) + cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona) + cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1) + cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1) + cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2) + cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) cflags-$(CONFIG_MCORE2) += \ - $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) - cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) + $(call cc-option,-march=core2,$(call cc-option,-mtune=core2)) + cflags-$(CONFIG_MNEHALEM) += \ + $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem)) + cflags-$(CONFIG_MWESTMERE) += \ + $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere)) + cflags-$(CONFIG_MSILVERMONT) += \ + $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont)) + cflags-$(CONFIG_MSANDYBRIDGE) += \ + $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge)) + cflags-$(CONFIG_MIVYBRIDGE) += \ + $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge)) + cflags-$(CONFIG_MHASWELL) += \ + $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell)) + cflags-$(CONFIG_MBROADWELL) += \ + $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell)) + cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \ + $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) KBUILD_CFLAGS += $(cflags-y) diff -Nur linux-4.2/arch/x86/Makefile_32.cpu linux-4.2-pck/arch/x86/Makefile_32.cpu --- linux-4.2/arch/x86/Makefile_32.cpu 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/Makefile_32.cpu 2015-09-08 00:50:52.507860491 -0300 @@ -10,6 +10,9 @@ endif align := $(cc-option-align) +ifeq ($(CONFIG_X86_MARCH_NATIVE),y) +cflags-y += -march=native +else cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 cflags-$(CONFIG_M586TSC) += -march=i586 @@ -23,7 +26,15 @@ # Please note, that patches that add -march=athlon-xp and friends are pointless. # They make zero difference whatsosever to performance at this time. 
cflags-$(CONFIG_MK7) += -march=athlon +cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) +cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon) +cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon) +cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon) +cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon) +cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon) +cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) +cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) @@ -32,8 +43,15 @@ cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) -cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) +cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem) +cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere) +cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont) +cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge) +cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge) +cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell) +cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell) +cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \ + $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) # AMD Elan support cflags-$(CONFIG_MELAN) += -march=i486 @@ -41,6 +59,8 @@ # Geode GX1 support cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx) +endif + # add at the end to overwrite eventual tuning options from earlier # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) diff -Nur linux-4.2/arch/x86/include/asm/module.h linux-4.2-pck/arch/x86/include/asm/module.h --- linux-4.2/arch/x86/include/asm/module.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/include/asm/module.h 2015-09-08 00:50:52.507860491 -0300 @@ -15,6 +15,22 @@ #define MODULE_PROC_FAMILY "586MMX " #elif defined CONFIG_MCORE2 #define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_MNATIVE +#define MODULE_PROC_FAMILY "NATIVE " +#elif defined CONFIG_MNEHALEM +#define MODULE_PROC_FAMILY "NEHALEM " +#elif defined CONFIG_MWESTMERE +#define MODULE_PROC_FAMILY "WESTMERE " +#elif defined CONFIG_MSILVERMONT +#define MODULE_PROC_FAMILY "SILVERMONT " +#elif defined CONFIG_MSANDYBRIDGE +#define MODULE_PROC_FAMILY "SANDYBRIDGE " +#elif defined CONFIG_MIVYBRIDGE +#define MODULE_PROC_FAMILY "IVYBRIDGE " +#elif defined CONFIG_MHASWELL +#define MODULE_PROC_FAMILY "HASWELL " +#elif defined CONFIG_MBROADWELL +#define MODULE_PROC_FAMILY "BROADWELL " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -33,6 +49,20 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " +#elif defined 
CONFIG_MK8SSE3 +#define MODULE_PROC_FAMILY "K8SSE3 " +#elif defined CONFIG_MK10 +#define MODULE_PROC_FAMILY "K10 " +#elif defined CONFIG_MBARCELONA +#define MODULE_PROC_FAMILY "BARCELONA " +#elif defined CONFIG_MBOBCAT +#define MODULE_PROC_FAMILY "BOBCAT " +#elif defined CONFIG_MBULLDOZER +#define MODULE_PROC_FAMILY "BULLDOZER " +#elif defined CONFIG_MPILEDRIVER +#define MODULE_PROC_FAMILY "PILEDRIVER " +#elif defined CONFIG_MJAGUAR +#define MODULE_PROC_FAMILY "JAGUAR " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff -Nur linux-4.2/arch/x86/kernel/early_printk.c linux-4.2-pck/arch/x86/kernel/early_printk.c --- linux-4.2/arch/x86/kernel/early_printk.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/kernel/early_printk.c 2015-09-08 00:48:22.215031180 -0300 @@ -27,7 +27,8 @@ static int max_ypos = 25, max_xpos = 80; static int current_ypos = 25, current_xpos; -static void early_vga_write(struct console *con, const char *str, unsigned n) +static void early_vga_write(struct console *con, const char *str, unsigned n, + unsigned int loglevel) { char c; int i, k, j; @@ -118,7 +119,8 @@ return timeout ? 0 : -1; } -static void early_serial_write(struct console *con, const char *s, unsigned n) +static void early_serial_write(struct console *con, const char *s, unsigned n, + unsigned int loglevel) { while (*s && n-- > 0) { if (*s == '\n') diff -Nur linux-4.2/arch/x86/platform/efi/early_printk.c linux-4.2-pck/arch/x86/platform/efi/early_printk.c --- linux-4.2/arch/x86/platform/efi/early_printk.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/platform/efi/early_printk.c 2015-09-08 00:48:22.215031180 -0300 @@ -124,7 +124,8 @@ } static void -early_efi_write(struct console *con, const char *str, unsigned int num) +early_efi_write(struct console *con, const char *str, unsigned int num, + unsigned int loglevel) { struct screen_info *si; unsigned int len; diff -Nur linux-4.2/block/Kconfig.iosched linux-4.2-pck/block/Kconfig.iosched --- linux-4.2/block/Kconfig.iosched 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/block/Kconfig.iosched 2015-09-08 00:48:22.215031180 -0300 @@ -39,9 +39,30 @@ ---help--- Enable group IO scheduling in CFQ. +config IOSCHED_BFQ + tristate "BFQ I/O scheduler" + default y + ---help--- + The BFQ I/O scheduler tries to distribute bandwidth among + all processes according to their weights. + It aims at distributing the bandwidth as desired, independently of + the disk parameters and with any workload. It also tries to + guarantee low latency to interactive and soft real-time + applications. If compiled built-in (saying Y here), BFQ can + be configured to support hierarchical scheduling. + +config CGROUP_BFQIO + bool "BFQ hierarchical scheduling support" + depends on CGROUPS && IOSCHED_BFQ=y + default n + ---help--- + Enable hierarchical scheduling in BFQ, using the cgroups + filesystem interface. The name of the subsystem will be + bfqio. + choice prompt "Default I/O scheduler" - default DEFAULT_CFQ + default DEFAULT_BFQ help Select the I/O scheduler which will be used by default for all block devices. @@ -52,6 +73,16 @@ config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y + config DEFAULT_BFQ + bool "BFQ" if IOSCHED_BFQ=y + help + Selects BFQ as the default I/O scheduler which will be + used by default for all block devices. + The BFQ I/O scheduler aims at distributing the bandwidth + as desired, independently of the disk parameters and with + any workload. 
It also tries to guarantee low latency to + interactive and soft real-time applications. + config DEFAULT_NOOP bool "No-op" @@ -61,6 +92,7 @@ string default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ + default "bfq" if DEFAULT_BFQ default "noop" if DEFAULT_NOOP endmenu diff -Nur linux-4.2/block/Makefile linux-4.2-pck/block/Makefile --- linux-4.2/block/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/block/Makefile 2015-09-08 00:48:22.215031180 -0300 @@ -18,6 +18,7 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff -Nur linux-4.2/block/bfq-cgroup.c linux-4.2-pck/block/bfq-cgroup.c --- linux-4.2/block/bfq-cgroup.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/block/bfq-cgroup.c 2015-09-08 00:48:22.215031180 -0300 @@ -0,0 +1,936 @@ +/* + * BFQ: CGROUPS support. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. + */ + +#ifdef CONFIG_CGROUP_BFQIO + +static DEFINE_MUTEX(bfqio_mutex); + +static bool bfqio_is_removed(struct bfqio_cgroup *bgrp) +{ + return bgrp ? !bgrp->online : false; +} + +static struct bfqio_cgroup bfqio_root_cgroup = { + .weight = BFQ_DEFAULT_GRP_WEIGHT, + .ioprio = BFQ_DEFAULT_GRP_IOPRIO, + .ioprio_class = BFQ_DEFAULT_GRP_CLASS, +}; + +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; +} + +static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct bfqio_cgroup, css) : NULL; +} + +/* + * Search the bfq_group for bfqd into the hash table (by now only a list) + * of bgrp. Must be called under rcu_read_lock(). + */ +static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, + struct bfq_data *bfqd) +{ + struct bfq_group *bfqg; + void *key; + + hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) { + key = rcu_dereference(bfqg->bfqd); + if (key == bfqd) + return bfqg; + } + + return NULL; +} + +static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, + struct bfq_group *bfqg) +{ + struct bfq_entity *entity = &bfqg->entity; + + /* + * If the weight of the entity has never been set via the sysfs + * interface, then bgrp->weight == 0. In this case we initialize + * the weight from the current ioprio value. Otherwise, the group + * weight, if set, has priority over the ioprio value. 
+ */ + if (bgrp->weight == 0) { + entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio); + entity->new_ioprio = bgrp->ioprio; + } else { + if (bgrp->weight < BFQ_MIN_WEIGHT || + bgrp->weight > BFQ_MAX_WEIGHT) { + printk(KERN_CRIT "bfq_group_init_entity: " + "bgrp->weight %d\n", bgrp->weight); + BUG(); + } + entity->new_weight = bgrp->weight; + entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight); + } + entity->orig_weight = entity->weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; + entity->my_sched_data = &bfqg->sched_data; + bfqg->active_entities = 0; +} + +static inline void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) +{ + struct bfq_entity *entity; + + BUG_ON(parent == NULL); + BUG_ON(bfqg == NULL); + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; +} + +/** + * bfq_group_chain_alloc - allocate a chain of groups. + * @bfqd: queue descriptor. + * @css: the leaf cgroup_subsys_state this chain starts from. + * + * Allocate a chain of groups starting from the one belonging to + * @cgroup up to the root cgroup. Stop if a cgroup on the chain + * to the root has already an allocated group on @bfqd. + */ +static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, + struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; + + for (; css != NULL; css = css->parent) { + bgrp = css_to_bfqio(css); + + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) { + /* + * All the cgroups in the path from there to the + * root must have a bfq_group for bfqd, so we don't + * need any more allocations. + */ + break; + } + + bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); + if (bfqg == NULL) + goto cleanup; + + bfq_group_init_entity(bgrp, bfqg); + bfqg->my_entity = &bfqg->entity; + + if (leaf == NULL) { + leaf = bfqg; + prev = leaf; + } else { + bfq_group_set_parent(prev, bfqg); + /* + * Build a list of allocated nodes using the bfqd + * filed, that is still unused and will be + * initialized only after the node will be + * connected. + */ + prev->bfqd = bfqg; + prev = bfqg; + } + } + + return leaf; + +cleanup: + while (leaf != NULL) { + prev = leaf; + leaf = leaf->bfqd; + kfree(prev); + } + + return NULL; +} + +/** + * bfq_group_chain_link - link an allocated group chain to a cgroup + * hierarchy. + * @bfqd: the queue descriptor. + * @css: the leaf cgroup_subsys_state to start from. + * @leaf: the leaf group (to be associated to @cgroup). + * + * Try to link a chain of groups to a cgroup hierarchy, connecting the + * nodes bottom-up, so we can be sure that when we find a cgroup in the + * hierarchy that already as a group associated to @bfqd all the nodes + * in the path to the root cgroup have one too. + * + * On locking: the queue lock protects the hierarchy (there is a hierarchy + * per device) while the bfqio_cgroup lock protects the list of groups + * belonging to the same cgroup. 
+ */ +static void bfq_group_chain_link(struct bfq_data *bfqd, + struct cgroup_subsys_state *css, + struct bfq_group *leaf) +{ + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *next, *prev = NULL; + unsigned long flags; + + assert_spin_locked(bfqd->queue->queue_lock); + + for (; css != NULL && leaf != NULL; css = css->parent) { + bgrp = css_to_bfqio(css); + next = leaf->bfqd; + + bfqg = bfqio_lookup_group(bgrp, bfqd); + BUG_ON(bfqg != NULL); + + spin_lock_irqsave(&bgrp->lock, flags); + + rcu_assign_pointer(leaf->bfqd, bfqd); + hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); + hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); + + spin_unlock_irqrestore(&bgrp->lock, flags); + + prev = leaf; + leaf = next; + } + + BUG_ON(css == NULL && leaf != NULL); + if (css != NULL && prev != NULL) { + bgrp = css_to_bfqio(css); + bfqg = bfqio_lookup_group(bgrp, bfqd); + bfq_group_set_parent(prev, bfqg); + } +} + +/** + * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. + * @bfqd: queue descriptor. + * @cgroup: cgroup being searched for. + * + * Return a group associated to @bfqd in @cgroup, allocating one if + * necessary. When a group is returned all the cgroups in the path + * to the root have a group associated to @bfqd. + * + * If the allocation fails, return the root group: this breaks guarantees + * but is a safe fallback. If this loss becomes a problem it can be + * mitigated using the equivalent weight (given by the product of the + * weights of the groups in the path from @group to the root) in the + * root scheduler. + * + * We allocate all the missing nodes in the path from the leaf cgroup + * to the root and we connect the nodes only after all the allocations + * have been successful. + */ +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, + struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + struct bfq_group *bfqg; + + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) + return bfqg; + + bfqg = bfq_group_chain_alloc(bfqd, css); + if (bfqg != NULL) + bfq_group_chain_link(bfqd, css, bfqg); + else + bfqg = bfqd->root_group; + + return bfqg; +} + +/** + * bfq_bfqq_move - migrate @bfqq to @bfqg. + * @bfqd: queue descriptor. + * @bfqq: the queue to move. + * @entity: @bfqq's entity. + * @bfqg: the group to move to. + * + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating + * it on the new one. Avoid putting the entity on the old group idle tree. + * + * Must be called under the queue lock; the cgroup owning @bfqg must + * not disappear (by now this just means that we are called under + * rcu_read_lock()). + */ +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_entity *entity, struct bfq_group *bfqg) +{ + int busy, resume; + + busy = bfq_bfqq_busy(bfqq); + resume = !RB_EMPTY_ROOT(&bfqq->sort_list); + + BUG_ON(resume && !entity->on_st); + BUG_ON(busy && !resume && entity->on_st && + bfqq != bfqd->in_service_queue); + + if (busy) { + BUG_ON(atomic_read(&bfqq->ref) < 2); + + if (!resume) + bfq_del_bfqq_busy(bfqd, bfqq, 0); + else + bfq_deactivate_bfqq(bfqd, bfqq, 0); + } else if (entity->on_st) + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + + /* + * Here we use a reference to bfqg. We don't need a refcounter + * as the cgroup reference will not be dropped, so that its + * destroy() callback will not be invoked. 
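+	 *
+	 * (Case summary added for clarity, not in the original patch;
+	 * at this point the four possible input states have been
+	 * handled as follows:
+	 *
+	 *     busy && resume   -> bfq_deactivate_bfqq(), re-activated
+	 *                         below after re-parenting;
+	 *     busy && !resume  -> bfq_del_bfqq_busy(), not re-activated;
+	 *     !busy && on_st   -> bfq_put_idle_entity();
+	 *     !busy && !on_st  -> nothing to undo;
+	 *
+	 * so the entity can now simply be re-parented to @bfqg.)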
+ */
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+
+	if (busy && resume)
+		bfq_activate_bfqq(bfqd, bfqq);
+
+	if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
+		bfq_schedule_dispatch(bfqd);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @css.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @css: the cgroup_subsys_state of the cgroup to move to.
+ *
+ * Move bic to the cgroup, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to the cgroup is valid across the
+ * call.
+ *
+ * NOTE: an alternative approach might have been to store the current
+ * cgroup in bfqq and get a reference to it, reducing the lookup
+ * time here, at the price of slightly more complex code.
+ */
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
+						 struct bfq_io_cq *bic,
+						 struct cgroup_subsys_state *css)
+{
+	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
+	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+	struct bfq_entity *entity;
+	struct bfq_group *bfqg;
+	struct bfqio_cgroup *bgrp;
+
+	bgrp = css_to_bfqio(css);
+
+	bfqg = bfq_find_alloc_group(bfqd, css);
+	if (async_bfqq != NULL) {
+		entity = &async_bfqq->entity;
+
+		if (entity->sched_data != &bfqg->sched_data) {
+			bic_set_bfqq(bic, NULL, 0);
+			bfq_log_bfqq(bfqd, async_bfqq,
+				     "bic_change_group: %p %d",
+				     async_bfqq, atomic_read(&async_bfqq->ref));
+			bfq_put_queue(async_bfqq);
+		}
+	}
+
+	if (sync_bfqq != NULL) {
+		entity = &sync_bfqq->entity;
+		if (entity->sched_data != &bfqg->sched_data)
+			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
+	}
+
+	return bfqg;
+}
+
+/**
+ * bfq_bic_change_cgroup - move @bic to @css.
+ * @bic: the bic being migrated.
+ * @css: the cgroup_subsys_state of the destination cgroup.
+ *
+ * When the task owning @bic is moved to the cgroup, @bic is immediately
+ * moved into its new parent group.
+ */
+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
+				  struct cgroup_subsys_state *css)
+{
+	struct bfq_data *bfqd;
+	unsigned long uninitialized_var(flags);
+
+	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
+				   &flags);
+	if (bfqd != NULL) {
+		__bfq_bic_change_cgroup(bfqd, bic, css);
+		bfq_put_bfqd_unlock(bfqd, &flags);
+	}
+}
+
+/**
+ * bfq_bic_update_cgroup - update the cgroup of @bic.
+ * @bic: the @bic to update.
+ *
+ * Make sure that @bic is enqueued in the cgroup of the current task.
+ * We need this in addition to moving bics during the cgroup attach
+ * phase because the task owning @bic could be at its first disk
+ * access, or we may end up in the root cgroup as the result of a
+ * memory allocation failure, in which case we try here to move to
+ * the right group.
+ *
+ * Must be called under the queue lock. It is safe to use the returned
+ * value even after the rcu_read_unlock() as the migration/destruction
+ * paths act under the queue lock too. IOW it is impossible to race with
+ * group migration/destruction and end up with an invalid group as:
+ * a) here cgroup has not yet been destroyed, nor its destroy callback
+ *    has started execution, as current holds a reference to it,
+ * b) if it is destroyed after rcu_read_unlock() [after current is
+ *    migrated to a different cgroup] its attach() callback will have
+ *    taken care of removing all the references to the old cgroup data.
+ */ +static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_group *bfqg; + struct cgroup_subsys_state *css; + + BUG_ON(bfqd == NULL); + + rcu_read_lock(); + css = task_css(current, bfqio_cgrp_id); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, css); + rcu_read_unlock(); + + return bfqg; +} + +/** + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. + * @st: the service tree being flushed. + */ +static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) +{ + struct bfq_entity *entity = st->first_idle; + + for (; entity != NULL; entity = st->first_idle) + __bfq_deactivate_entity(entity, 0); +} + +/** + * bfq_reparent_leaf_entity - move leaf entity to the root_group. + * @bfqd: the device data structure with the root group. + * @entity: the entity to move. + */ +static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(bfqq == NULL); + bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); + return; +} + +/** + * bfq_reparent_active_entities - move to the root group all active + * entities. + * @bfqd: the device data structure with the root group. + * @bfqg: the group to move from. + * @st: the service tree with the entities. + * + * Needs queue_lock to be taken and reference to be valid over the call. + */ +static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st) +{ + struct rb_root *active = &st->active; + struct bfq_entity *entity = NULL; + + if (!RB_EMPTY_ROOT(&st->active)) + entity = bfq_entity_of(rb_first(active)); + + for (; entity != NULL; entity = bfq_entity_of(rb_first(active))) + bfq_reparent_leaf_entity(bfqd, entity); + + if (bfqg->sched_data.in_service_entity != NULL) + bfq_reparent_leaf_entity(bfqd, + bfqg->sched_data.in_service_entity); + + return; +} + +/** + * bfq_destroy_group - destroy @bfqg. + * @bgrp: the bfqio_cgroup containing @bfqg. + * @bfqg: the group being destroyed. + * + * Destroy @bfqg, making sure that it is not referenced from its parent. + */ +static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) +{ + struct bfq_data *bfqd; + struct bfq_service_tree *st; + struct bfq_entity *entity = bfqg->my_entity; + unsigned long uninitialized_var(flags); + int i; + + hlist_del(&bfqg->group_node); + + /* + * Empty all service_trees belonging to this group before + * deactivating the group itself. + */ + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { + st = bfqg->sched_data.service_tree + i; + + /* + * The idle tree may still contain bfq_queues belonging + * to exited task because they never migrated to a different + * cgroup from the one being destroyed now. No one else + * can access them so it's safe to act without any lock. + */ + bfq_flush_idle_tree(st); + + /* + * It may happen that some queues are still active + * (busy) upon group destruction (if the corresponding + * processes have been forced to terminate). We move + * all the leaf entities corresponding to these queues + * to the root_group. + * Also, it may happen that the group has an entity + * in service, which is disconnected from the active + * tree: it must be moved, too. + * There is no need to put the sync queues, as the + * scheduler has taken no reference. 
+ */ + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + bfq_reparent_active_entities(bfqd, bfqg, st); + bfq_put_bfqd_unlock(bfqd, &flags); + } + BUG_ON(!RB_EMPTY_ROOT(&st->active)); + BUG_ON(!RB_EMPTY_ROOT(&st->idle)); + } + BUG_ON(bfqg->sched_data.next_in_service != NULL); + BUG_ON(bfqg->sched_data.in_service_entity != NULL); + + /* + * We may race with device destruction, take extra care when + * dereferencing bfqg->bfqd. + */ + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + hlist_del(&bfqg->bfqd_node); + __bfq_deactivate_entity(entity, 0); + bfq_put_async_queues(bfqd, bfqg); + bfq_put_bfqd_unlock(bfqd, &flags); + } + BUG_ON(entity->tree != NULL); + + /* + * No need to defer the kfree() to the end of the RCU grace + * period: we are called from the destroy() callback of our + * cgroup, so we can be sure that no one is a) still using + * this cgroup or b) doing lookups in it. + */ + kfree(bfqg); +} + +static void bfq_end_wr_async(struct bfq_data *bfqd) +{ + struct hlist_node *tmp; + struct bfq_group *bfqg; + + hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) + bfq_end_wr_async_queues(bfqd, bfqg); + bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + +/** + * bfq_disconnect_groups - disconnect @bfqd from all its groups. + * @bfqd: the device descriptor being exited. + * + * When the device exits we just make sure that no lookup can return + * the now unused group structures. They will be deallocated on cgroup + * destruction. + */ +static void bfq_disconnect_groups(struct bfq_data *bfqd) +{ + struct hlist_node *tmp; + struct bfq_group *bfqg; + + bfq_log(bfqd, "disconnect_groups beginning"); + hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) { + hlist_del(&bfqg->bfqd_node); + + __bfq_deactivate_entity(bfqg->my_entity, 0); + + /* + * Don't remove from the group hash, just set an + * invalid key. No lookups can race with the + * assignment as bfqd is being destroyed; this + * implies also that new elements cannot be added + * to the list. + */ + rcu_assign_pointer(bfqg->bfqd, NULL); + + bfq_log(bfqd, "disconnect_groups: put async for group %p", + bfqg); + bfq_put_async_queues(bfqd, bfqg); + } +} + +static inline void bfq_free_root_group(struct bfq_data *bfqd) +{ + struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; + struct bfq_group *bfqg = bfqd->root_group; + + bfq_put_async_queues(bfqd, bfqg); + + spin_lock_irq(&bgrp->lock); + hlist_del_rcu(&bfqg->group_node); + spin_unlock_irq(&bgrp->lock); + + /* + * No need to synchronize_rcu() here: since the device is gone + * there cannot be any read-side access to its root_group. 
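+	 *
+	 * (Background note added, not in the original patch: the
+	 * usual teardown of an RCU-published object would be
+	 *
+	 *     rcu_assign_pointer(ptr, NULL);
+	 *     synchronize_rcu();   - or kfree_rcu() -
+	 *     kfree(obj);
+	 *
+	 * the grace-period wait can be skipped here only because
+	 * device teardown guarantees that no readers are left.)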
+ */ + kfree(bfqg); +} + +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) +{ + struct bfq_group *bfqg; + struct bfqio_cgroup *bgrp; + int i; + + bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node); + if (bfqg == NULL) + return NULL; + + bfqg->entity.parent = NULL; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + bgrp = &bfqio_root_cgroup; + spin_lock_irq(&bgrp->lock); + rcu_assign_pointer(bfqg->bfqd, bfqd); + hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); + spin_unlock_irq(&bgrp->lock); + + return bfqg; +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \ + struct cftype *cftype) \ +{ \ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ + u64 ret = -ENODEV; \ + \ + mutex_lock(&bfqio_mutex); \ + if (bfqio_is_removed(bgrp)) \ + goto out_unlock; \ + \ + spin_lock_irq(&bgrp->lock); \ + ret = bgrp->__VAR; \ + spin_unlock_irq(&bgrp->lock); \ + \ +out_unlock: \ + mutex_unlock(&bfqio_mutex); \ + return ret; \ +} + +SHOW_FUNCTION(weight); +SHOW_FUNCTION(ioprio); +SHOW_FUNCTION(ioprio_class); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ +static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\ + struct cftype *cftype, \ + u64 val) \ +{ \ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ + struct bfq_group *bfqg; \ + int ret = -EINVAL; \ + \ + if (val < (__MIN) || val > (__MAX)) \ + return ret; \ + \ + ret = -ENODEV; \ + mutex_lock(&bfqio_mutex); \ + if (bfqio_is_removed(bgrp)) \ + goto out_unlock; \ + ret = 0; \ + \ + spin_lock_irq(&bgrp->lock); \ + bgrp->__VAR = (unsigned short)val; \ + hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \ + /* \ + * Setting the ioprio_changed flag of the entity \ + * to 1 with new_##__VAR == ##__VAR would re-set \ + * the value of the weight to its ioprio mapping. \ + * Set the flag only if necessary. \ + */ \ + if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ + bfqg->entity.new_##__VAR = (unsigned short)val; \ + /* \ + * Make sure that the above new value has been \ + * stored in bfqg->entity.new_##__VAR before \ + * setting the ioprio_changed flag. In fact, \ + * this flag may be read asynchronously (in \ + * critical sections protected by a different \ + * lock than that held here), and finding this \ + * flag set may cause the execution of the code \ + * for updating parameters whose value may \ + * depend also on bfqg->entity.new_##__VAR (in \ + * __bfq_entity_update_weight_prio). \ + * This barrier makes sure that the new value \ + * of bfqg->entity.new_##__VAR is correctly \ + * seen in that code. 
\ + */ \ + smp_wmb(); \ + bfqg->entity.ioprio_changed = 1; \ + } \ + } \ + spin_unlock_irq(&bgrp->lock); \ + \ +out_unlock: \ + mutex_unlock(&bfqio_mutex); \ + return ret; \ +} + +STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); +STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); +#undef STORE_FUNCTION + +static struct cftype bfqio_files[] = { + { + .name = "weight", + .read_u64 = bfqio_cgroup_weight_read, + .write_u64 = bfqio_cgroup_weight_write, + }, + { + .name = "ioprio", + .read_u64 = bfqio_cgroup_ioprio_read, + .write_u64 = bfqio_cgroup_ioprio_write, + }, + { + .name = "ioprio_class", + .read_u64 = bfqio_cgroup_ioprio_class_read, + .write_u64 = bfqio_cgroup_ioprio_class_write, + }, + { }, /* terminate */ +}; + +static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state + *parent_css) +{ + struct bfqio_cgroup *bgrp; + + if (parent_css != NULL) { + bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); + if (bgrp == NULL) + return ERR_PTR(-ENOMEM); + } else + bgrp = &bfqio_root_cgroup; + + spin_lock_init(&bgrp->lock); + INIT_HLIST_HEAD(&bgrp->group_data); + bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; + bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; + + return &bgrp->css; +} + +/* + * We cannot support shared io contexts, as we have no means to support + * two tasks with the same ioc in two different groups without major rework + * of the main bic/bfqq data structures. By now we allow a task to change + * its cgroup only if it's the only owner of its ioc; the drawback of this + * behavior is that a group containing a task that forked using CLONE_IO + * will not be destroyed until the tasks sharing the ioc die. + */ +static int bfqio_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct io_context *ioc; + int ret = 0; + + cgroup_taskset_for_each(task, tset) { + /* + * task_lock() is needed to avoid races with + * exit_io_context() + */ + task_lock(task); + ioc = task->io_context; + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) + /* + * ioc == NULL means that the task is either too + * young or exiting: if it has still no ioc the + * ioc can't be shared, if the task is exiting the + * attach will fail anyway, no matter what we + * return here. + */ + ret = -EINVAL; + task_unlock(task); + if (ret) + break; + } + + return ret; +} + +static void bfqio_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct io_context *ioc; + struct io_cq *icq; + + /* + * IMPORTANT NOTE: The move of more than one process at a time to a + * new group has not yet been tested. + */ + cgroup_taskset_for_each(task, tset) { + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + /* + * Handle cgroup change here. 
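+			 * (Note added for clarity, not in the original
+			 * patch: the strncmp() on the elevator name
+			 * below restricts the migration to icqs whose
+			 * request queue is currently scheduled by bfq;
+			 * icqs of queues run by other elevators are
+			 * left untouched.)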
+ */ + rcu_read_lock(); + hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node) + if (!strncmp( + icq->q->elevator->type->elevator_name, + "bfq", ELV_NAME_MAX)) + bfq_bic_change_cgroup(icq_to_bic(icq), + css); + rcu_read_unlock(); + put_io_context(ioc); + } + } +} + +static void bfqio_destroy(struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + struct hlist_node *tmp; + struct bfq_group *bfqg; + + /* + * Since we are destroying the cgroup, there are no more tasks + * referencing it, and all the RCU grace periods that may have + * referenced it are ended (as the destruction of the parent + * cgroup is RCU-safe); bgrp->group_data will not be accessed by + * anything else and we don't need any synchronization. + */ + hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node) + bfq_destroy_group(bgrp, bfqg); + + BUG_ON(!hlist_empty(&bgrp->group_data)); + + kfree(bgrp); +} + +static int bfqio_css_online(struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + + mutex_lock(&bfqio_mutex); + bgrp->online = true; + mutex_unlock(&bfqio_mutex); + + return 0; +} + +static void bfqio_css_offline(struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + + mutex_lock(&bfqio_mutex); + bgrp->online = false; + mutex_unlock(&bfqio_mutex); +} + +struct cgroup_subsys bfqio_cgrp_subsys = { + .css_alloc = bfqio_create, + .css_online = bfqio_css_online, + .css_offline = bfqio_css_offline, + .can_attach = bfqio_can_attach, + .attach = bfqio_attach, + .css_free = bfqio_destroy, + .legacy_cftypes = bfqio_files, +}; +#else +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->sched_data = &bfqg->sched_data; +} + +static inline struct bfq_group * +bfq_bic_update_cgroup(struct bfq_io_cq *bic) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + return bfqd->root_group; +} + +static inline void bfq_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_entity *entity, + struct bfq_group *bfqg) +{ +} + +static void bfq_end_wr_async(struct bfq_data *bfqd) +{ + bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + +static inline void bfq_disconnect_groups(struct bfq_data *bfqd) +{ + bfq_put_async_queues(bfqd, bfqd->root_group); +} + +static inline void bfq_free_root_group(struct bfq_data *bfqd) +{ + kfree(bfqd->root_group); +} + +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) +{ + struct bfq_group *bfqg; + int i; + + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); + if (bfqg == NULL) + return NULL; + + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + return bfqg; +} +#endif diff -Nur linux-4.2/block/bfq-ioc.c linux-4.2-pck/block/bfq-ioc.c --- linux-4.2/block/bfq-ioc.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/block/bfq-ioc.c 2015-09-08 00:48:22.215031180 -0300 @@ -0,0 +1,36 @@ +/* + * BFQ: I/O context handling. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + */ + +/** + * icq_to_bic - convert iocontext queue structure to bfq_io_cq. + * @icq: the iocontext queue. 
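+ *
+ * (Illustrative note added, not part of the original patch: this
+ * conversion relies on the icq field being the very first member of
+ * struct bfq_io_cq, i.e.
+ *
+ *     struct bfq_io_cq {
+ *             struct io_cq icq;   <-- offsetof(..., icq) == 0
+ *             ...
+ *     };
+ *
+ * with offset zero, container_of() degenerates to a cast, so a NULL
+ * io_cq pointer safely converts to a NULL bfq_io_cq pointer.)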
+ */ +static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) +{ + /* bic->icq is the first member, %NULL will convert to %NULL */ + return container_of(icq, struct bfq_io_cq, icq); +} + +/** + * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. + * @bfqd: the lookup key. + * @ioc: the io_context of the process doing I/O. + * + * Queue lock must be held. + */ +static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct io_context *ioc) +{ + if (ioc) + return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); + return NULL; +} diff -Nur linux-4.2/block/bfq-iosched.c linux-4.2-pck/block/bfq-iosched.c --- linux-4.2/block/bfq-iosched.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/block/bfq-iosched.c 2015-09-08 00:48:22.218364355 -0300 @@ -0,0 +1,4218 @@ +/* + * Budget Fair Queueing (BFQ) disk scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. + * + * BFQ is a proportional-share storage-I/O scheduling algorithm based on + * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, + * measured in number of sectors, to processes instead of time slices. The + * device is not granted to the in-service process for a given time slice, + * but until it has exhausted its assigned budget. This change from the time + * to the service domain allows BFQ to distribute the device throughput + * among processes as desired, without any distortion due to ZBR, workload + * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, + * called B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated to processes. Thanks to the + * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to + * I/O-bound processes issuing sequential requests (to boost the + * throughput), and yet guarantee a low latency to interactive and soft + * real-time applications. + * + * BFQ is described in [1], where also a reference to the initial, more + * theoretical paper on BFQ can be found. The interested reader can find + * in the latter paper full details on the main algorithm, as well as + * formulas of the guarantees and formal proofs of all the properties. + * With respect to the version of BFQ presented in these papers, this + * implementation adds a few more heuristics, such as the one that + * guarantees a low latency to soft real-time applications, and a + * hierarchical extension based on H-WF2Q+. + * + * B-WF2Q+ is based on WF2Q+, that is described in [2], together with + * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) + * complexity derives from the one introduced with EEVDF in [3]. + * + * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness + * with the BFQ Disk I/O Scheduler'', + * Proceedings of the 5th Annual International Systems and Storage + * Conference (SYSTOR '12), June 2012. + * + * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf + * + * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing + * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, + * Oct 1997. + * + * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz + * + * [3] I. Stoica and H. 
Abdel-Wahab, ``Earliest Eligible Virtual Deadline
+ *     First: A Flexible and Accurate Mechanism for Proportional Share
+ *     Resource Allocation,'' technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include "bfq.h"
+#include "blk.h"
+
+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+
+/* Maximum backwards seek, in KiB. */
+static const int bfq_back_max = 16 * 1024;
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 2;
+
+/* Idling period duration, in jiffies. */
+static int bfq_slice_idle = HZ / 125;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
+static const int bfq_max_budget_async_rq = 4;
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+ */
+static const int bfq_async_charge_factor = 10;
+
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
+static const int bfq_timeout_sync = HZ / 8;
+static int bfq_timeout_async = HZ / 25;
+
+struct kmem_cache *bfq_pool;
+
+/* Below this threshold (in ms), we consider thinktime immediate. */
+#define BFQ_MIN_TT		2
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD	4
+#define BFQ_HW_QUEUE_SAMPLES	32
+
+#define BFQQ_SEEK_THR	(sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
+
+/* Min samples used for peak rate estimation (for autotuning). */
+#define BFQ_PEAK_RATE_SAMPLES	32
+
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT		16
+
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ *   SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
+ * are the reference values for a slow/fast rotational device, whereas
+ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
+ * a slow/fast non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes.
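+ *
+ * (Worked example added for clarity, not part of the original patch:
+ * consider a rotational device classified as fast, so that
+ * R = R_fast[0] and T = T_fast[0]. If the estimated peak rate r of
+ * the device at hand is twice the reference rate R, then
+ *
+ *     duration = (R / r) * T = T / 2,
+ *
+ * i.e., interactive weight raising lasts half the reference time:
+ * the faster the device, the shorter the boost needed to load the
+ * same applications.)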
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+static int R_slow[2] = {1536, 10752};
+static int R_fast[2] = {17415, 34791};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
+
+#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
+				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
+
+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
+
+#include "bfq-ioc.c"
+#include "bfq-sched.c"
+#include "bfq-cgroup.c"
+
+#define bfq_class_idle(bfqq)	((bfqq)->entity.ioprio_class ==\
+				 IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq)	((bfqq)->entity.ioprio_class ==\
+				 IOPRIO_CLASS_RT)
+
+#define bfq_sample_valid(samples)	((samples) > 80)
+
+/*
+ * The following macro groups conditions that need to be evaluated when
+ * checking if existing queues and groups form a symmetric scenario
+ * and therefore idling can be reduced or disabled for some of the
+ * queues. See the comment to the function bfq_bfqq_must_not_expire()
+ * for further details.
+ */
+#ifdef CONFIG_CGROUP_BFQIO
+#define symmetric_scenario	(!bfqd->active_numerous_groups && \
+				 !bfq_differentiated_weights(bfqd))
+#else
+#define symmetric_scenario	(!bfq_differentiated_weights(bfqd))
+#endif
+
+/*
+ * We regard a request as SYNC if it's either a read or it has the SYNC
+ * bit set (in which case it could also be a direct WRITE).
+ */
+static inline int bfq_bio_sync(struct bio *bio)
+{
+	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Schedule a run of the queue if there are requests pending and no one
+ * in the driver will restart queueing.
+ */
+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+	if (bfqd->queued != 0) {
+		bfq_log(bfqd, "schedule dispatch");
+		kblockd_schedule_work(&bfqd->unplug_work);
+	}
+}
+
+/*
+ * Lifted from AS: choose which of rq1 and rq2 is best served now.
+ * We choose the request that is closest to the head right now. Distance
+ * behind the head is penalized and only allowed to a certain extent.
+ */
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
+				      struct request *rq1,
+				      struct request *rq2,
+				      sector_t last)
+{
+	sector_t s1, s2, d1 = 0, d2 = 0;
+	unsigned long back_max;
+#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
+#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
+	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
+
+	if (rq1 == NULL || rq1 == rq2)
+		return rq2;
+	if (rq2 == NULL)
+		return rq1;
+
+	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
+		return rq1;
+	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
+		return rq2;
+	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
+		return rq1;
+	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
+		return rq2;
+
+	s1 = blk_rq_pos(rq1);
+	s2 = blk_rq_pos(rq2);
+
+	/*
+	 * By definition, 1KiB is 2 sectors.
+	 */
+	back_max = bfqd->bfq_back_max * 2;
+
+	/*
+	 * Strict one way elevator _except_ in the case where we allow
+	 * short backward seeks which are biased as twice the cost of a
+	 * similar forward seek.
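+	 *
+	 * (Worked example added, not in the original patch: with
+	 * hypothetical values last = 1000, back_max = 200 and
+	 * bfq_back_penalty = 2, a request at s1 = 1100 gets d1 = 100,
+	 * while a request at s2 = 900, behind the head but within
+	 * back_max, gets d2 = (1000 - 900) * 2 = 200; the forward
+	 * request wins although both are 100 sectors away. A request
+	 * at sector 700 would instead be flagged as wrapping.)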
+ */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ1_WRAP; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ2_WRAP; + + /* Found required data */ + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! + */ + switch (wrap) { + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ + if (d1 < d2) + return rq1; + else if (d2 < d1) + return rq2; + else { + if (s1 >= s2) + return rq1; + else + return rq2; + } + + case BFQ_RQ2_WRAP: + return rq1; + case BFQ_RQ1_WRAP: + return rq2; + case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) + return rq1; + else + return rq2; + } +} + +static struct bfq_queue * +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) +{ + struct rb_node **p, *parent; + struct bfq_queue *bfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + bfqq = rb_entry(parent, struct bfq_queue, pos_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_right; + else if (sector < blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_left; + else + break; + p = n; + bfqq = NULL; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", + (long long unsigned)sector, + bfqq != NULL ? bfqq->pid : 0); + + return bfqq; +} + +static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct rb_node **p, *parent; + struct bfq_queue *__bfqq; + + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) + return; + + bfqq->pos_root = &bfqd->rq_pos_tree; + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, + blk_rq_pos(bfqq->next_rq), &parent, &p); + if (__bfqq == NULL) { + rb_link_node(&bfqq->pos_node, parent, p); + rb_insert_color(&bfqq->pos_node, bfqq->pos_root); + } else + bfqq->pos_root = NULL; +} + +/* + * Tell whether there are active queues or groups with differentiated weights. + */ +static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) +{ + /* + * For weights to differ, at least one of the trees must contain + * at least two nodes. + */ + return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && + (bfqd->queue_weights_tree.rb_node->rb_left || + bfqd->queue_weights_tree.rb_node->rb_right) +#ifdef CONFIG_CGROUP_BFQIO + ) || + (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && + (bfqd->group_weights_tree.rb_node->rb_left || + bfqd->group_weights_tree.rb_node->rb_right) +#endif + ); +} + +/* + * If the weight-counter tree passed as input contains no counter for + * the weight of the input entity, then add that counter; otherwise just + * increment the existing counter. + * + * Note that weight-counter trees contain few nodes in mostly symmetric + * scenarios. For example, if all queues have the same weight, then the + * weight-counter tree for the queues may contain at most one node. 
+ * This holds even if low_latency is on, because weight-raised queues
+ * are not inserted in the tree.
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
+ */
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+				 struct bfq_entity *entity,
+				 struct rb_root *root)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/*
+	 * Do not insert if the entity is already associated with a
+	 * counter, which happens if:
+	 * 1) the entity is associated with a queue,
+	 * 2) a request arrival has caused the queue to become both
+	 *    non-weight-raised, and hence change its weight, and
+	 *    backlogged; in this respect, each of the two events
+	 *    causes an invocation of this function,
+	 * 3) this is the invocation of this function caused by the
+	 *    second event. This second invocation is actually useless,
+	 *    and we handle this fact by exiting immediately. More
+	 *    efficient or clearer solutions might possibly be adopted.
+	 */
+	if (entity->weight_counter)
+		return;
+
+	while (*new) {
+		struct bfq_weight_counter *__counter = container_of(*new,
+						struct bfq_weight_counter,
+						weights_node);
+		parent = *new;
+
+		if (entity->weight == __counter->weight) {
+			entity->weight_counter = __counter;
+			goto inc_counter;
+		}
+		if (entity->weight < __counter->weight)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+					 GFP_ATOMIC);
+	/*
+	 * In the unlucky event of an allocation failure, just exit:
+	 * the entity is simply not accounted for in the tree. The
+	 * worst consequence is a less accurate symmetry detection,
+	 * which is preferable to dereferencing a NULL pointer below.
+	 */
+	if (!entity->weight_counter)
+		return;
+	entity->weight_counter->weight = entity->weight;
+	rb_link_node(&entity->weight_counter->weights_node, parent, new);
+	rb_insert_color(&entity->weight_counter->weights_node, root);
+
+inc_counter:
+	entity->weight_counter->num_active++;
+}
+
+/*
+ * Decrement the weight counter associated with the entity, and, if the
+ * counter reaches 0, remove the counter from the tree.
+ * See the comments to the function bfq_weights_tree_add() for considerations
+ * about overhead.
+ */
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+				    struct bfq_entity *entity,
+				    struct rb_root *root)
+{
+	if (!entity->weight_counter)
+		return;
+
+	BUG_ON(RB_EMPTY_ROOT(root));
+	BUG_ON(entity->weight_counter->weight != entity->weight);
+
+	BUG_ON(!entity->weight_counter->num_active);
+	entity->weight_counter->num_active--;
+	if (entity->weight_counter->num_active > 0)
+		goto reset_entity_pointer;
+
+	rb_erase(&entity->weight_counter->weights_node, root);
+	kfree(entity->weight_counter);
+
+reset_entity_pointer:
+	entity->weight_counter = NULL;
+}
+
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq,
+					struct request *last)
+{
+	struct rb_node *rbnext = rb_next(&last->rb_node);
+	struct rb_node *rbprev = rb_prev(&last->rb_node);
+	struct request *next = NULL, *prev = NULL;
+
+	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
+
+	if (rbprev != NULL)
+		prev = rb_entry_rq(rbprev);
+
+	if (rbnext != NULL)
+		next = rb_entry_rq(rbnext);
+	else {
+		rbnext = rb_first(&bfqq->sort_list);
+		if (rbnext && rbnext != &last->rb_node)
+			next = rb_entry_rq(rbnext);
+	}
+
+	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
+}
+
+/* see the definition of bfq_async_charge_factor for details */
+static inline unsigned long bfq_serv_to_charge(struct request *rq,
+					       struct bfq_queue *bfqq)
+{
+	return blk_rq_sectors(rq) *
+		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
+		bfq_async_charge_factor));
+}
+
+/**
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
+ * @bfqd: the device data the queue belongs to.
+ * @bfqq: the queue to update. + * + * If the first request of a queue changes we make sure that the queue + * has enough budget to serve at least its first request (if the + * request has grown). We do this because if the queue has not enough + * budget for its first request, it has to go through two dispatch + * rounds to actually get it dispatched. + */ +static void bfq_updated_next_req(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct request *next_rq = bfqq->next_rq; + unsigned long new_budget; + + if (next_rq == NULL) + return; + + if (bfqq == bfqd->in_service_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. + */ + return; + + BUG_ON(entity->tree != &st->active); + BUG_ON(entity == entity->sched_data->in_service_entity); + + new_budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + if (entity->budget != new_budget) { + entity->budget = new_budget; + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", + new_budget); + bfq_activate_bfqq(bfqd, bfqq); + } +} + +static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + return dur; +} + +static inline unsigned +bfq_bfqq_cooperations(struct bfq_queue *bfqq) +{ + return bfqq->bic ? bfqq->bic->cooperations : 0; +} + +static inline void +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + if (bic->saved_idle_window) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); + if (bic->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + /* Assuming that the flag in_large_burst is already correctly set */ + if (bic->wr_time_left && bfqq->bfqd->low_latency && + !bfq_bfqq_in_large_burst(bfqq) && + bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { + /* + * Start a weight raising period with the duration given by + * the raising_time_left snapshot. + */ + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues++; + bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bic->wr_time_left; + bfqq->last_wr_start_finish = jiffies; + bfqq->entity.ioprio_changed = 1; + } + /* + * Clear wr_time_left to prevent bfq_bfqq_save_state() from + * getting confused about the queue's need of a weight-raising + * period. + */ + bic->wr_time_left = 0; +} + +/* Must be called with the queue_lock held. 
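+ *
+ * (Worked example added, not in the original patch: the function
+ * below computes how many process references point to @bfqq. For
+ * instance, a queue with an atomic ref count of 5, one allocated
+ * READ request, one allocated WRITE request and entity.on_st == 1
+ * has 5 - (1 + 1) - 1 == 2 process references, i.e., two tasks
+ * still reach it through their bfq_io_cq.)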
*/ +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; + + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; + process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); + return process_refs; +} + +/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ +static inline void bfq_reset_burst_list(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_queue *item; + struct hlist_node *n; + + hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) + hlist_del_init(&item->burst_list_node); + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); + bfqd->burst_size = 1; +} + +/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ +static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* Increment burst size to take into account also bfqq */ + bfqd->burst_size++; + + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { + struct bfq_queue *pos, *bfqq_item; + struct hlist_node *n; + + /* + * Enough queues have been activated shortly after each + * other to consider this burst as large. + */ + bfqd->large_burst = true; + + /* + * We can now mark all queues in the burst list as + * belonging to a large burst. + */ + hlist_for_each_entry(bfqq_item, &bfqd->burst_list, + burst_list_node) + bfq_mark_bfqq_in_large_burst(bfqq_item); + bfq_mark_bfqq_in_large_burst(bfqq); + + /* + * From now on, and until the current burst finishes, any + * new queue being activated shortly after the last queue + * was inserted in the burst can be immediately marked as + * belonging to a large burst. So the burst list is not + * needed any more. Remove it. + */ + hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, + burst_list_node) + hlist_del_init(&pos->burst_list_node); + } else /* burst not yet large: add bfqq to the burst list */ + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); +} + +/* + * If many queues happen to become active shortly after each other, then, + * to help the processes associated to these queues get their job done as + * soon as possible, it is usually better to not grant either weight-raising + * or device idling to these queues. In this comment we describe, firstly, + * the reasons why this fact holds, and, secondly, the next function, which + * implements the main steps needed to properly mark these queues so that + * they can then be treated in a different way. + * + * As for the terminology, we say that a queue becomes active, i.e., + * switches from idle to backlogged, either when it is created (as a + * consequence of the arrival of an I/O request), or, if already existing, + * when a new request for the queue arrives while the queue is idle. + * Bursts of activations, i.e., activations of different queues occurring + * shortly after each other, are typically caused by services or applications + * that spawn or reactivate many parallel threads/processes. Examples are + * systemd during boot or git grep. + * + * These services or applications benefit mostly from a high throughput: + * the quicker the requests of the activated queues are cumulatively served, + * the sooner the target job of these queues gets completed. As a consequence, + * weight-raising any of these queues, which also implies idling the device + * for it, is almost always counterproductive: in most cases it just lowers + * throughput. 
+ * + * On the other hand, a burst of activations may be also caused by the start + * of an application that does not consist in a lot of parallel I/O-bound + * threads. In fact, with a complex application, the burst may be just a + * consequence of the fact that several processes need to be executed to + * start-up the application. To start an application as quickly as possible, + * the best thing to do is to privilege the I/O related to the application + * with respect to all other I/O. Therefore, the best strategy to start as + * quickly as possible an application that causes a burst of activations is + * to weight-raise all the queues activated during the burst. This is the + * exact opposite of the best strategy for the other type of bursts. + * + * In the end, to take the best action for each of the two cases, the two + * types of bursts need to be distinguished. Fortunately, this seems + * relatively easy to do, by looking at the sizes of the bursts. In + * particular, we found a threshold such that bursts with a larger size + * than that threshold are apparently caused only by services or commands + * such as systemd or git grep. For brevity, hereafter we call just 'large' + * these bursts. BFQ *does not* weight-raise queues whose activations occur + * in a large burst. In addition, for each of these queues BFQ performs or + * does not perform idling depending on which choice boosts the throughput + * most. The exact choice depends on the device and request pattern at + * hand. + * + * Turning back to the next function, it implements all the steps needed + * to detect the occurrence of a large burst and to properly mark all the + * queues belonging to it (so that they can then be treated in a different + * way). This goal is achieved by maintaining a special "burst list" that + * holds, temporarily, the queues that belong to the burst in progress. The + * list is then used to mark these queues as belonging to a large burst if + * the burst does become large. The main steps are the following. + * + * . when the very first queue is activated, the queue is inserted into the + * list (as it could be the first queue in a possible burst) + * + * . if the current burst has not yet become large, and a queue Q that does + * not yet belong to the burst is activated shortly after the last time + * at which a new queue entered the burst list, then the function appends + * Q to the burst list + * + * . if, as a consequence of the previous step, the burst size reaches + * the large-burst threshold, then + * + * . all the queues in the burst list are marked as belonging to a + * large burst + * + * . the burst list is deleted; in fact, the burst list already served + * its purpose (keeping temporarily track of the queues in a burst, + * so as to be able to mark them as belonging to a large burst in the + * previous sub-step), and now is not needed any more + * + * . the device enters a large-burst mode + * + * . if a queue Q that does not belong to the burst is activated while + * the device is in large-burst mode and shortly after the last time + * at which a queue either entered the burst list or was marked as + * belonging to the current large burst, then Q is immediately marked + * as belonging to a large burst. + * + * . 
if a queue Q that does not belong to the burst is activated a while + * later, i.e., not shortly after, than the last time at which a queue + * either entered the burst list or was marked as belonging to the + * current large burst, then the current burst is deemed as finished and: + * + * . the large-burst mode is reset if set + * + * . the burst list is emptied + * + * . Q is inserted in the burst list, as Q may be the first queue + * in a possible new burst (then the burst list contains just Q + * after this step). + */ +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool idle_for_long_time) +{ + /* + * If bfqq happened to be activated in a burst, but has been idle + * for at least as long as an interactive queue, then we assume + * that, in the overall I/O initiated in the burst, the I/O + * associated to bfqq is finished. So bfqq does not need to be + * treated as a queue belonging to a burst anymore. Accordingly, + * we reset bfqq's in_large_burst flag if set, and remove bfqq + * from the burst list if it's there. We do not decrement instead + * burst_size, because the fact that bfqq does not need to belong + * to the burst list any more does not invalidate the fact that + * bfqq may have been activated during the current burst. + */ + if (idle_for_long_time) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + + /* + * If bfqq is already in the burst list or is part of a large + * burst, then there is nothing else to do. + */ + if (!hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq)) + return; + + /* + * If bfqq's activation happens late enough, then the current + * burst is finished, and related data structures must be reset. + * + * In this respect, consider the special case where bfqq is the very + * first queue being activated. In this case, last_ins_in_burst is + * not yet significant when we get here. But it is easy to verify + * that, whether or not the following condition is true, bfqq will + * end up being inserted into the burst list. In particular the + * list will happen to contain only bfqq. And this is exactly what + * has to happen, as bfqq may be the first queue in a possible + * burst. + */ + if (time_is_before_jiffies(bfqd->last_ins_in_burst + + bfqd->bfq_burst_interval)) { + bfqd->large_burst = false; + bfq_reset_burst_list(bfqd, bfqq); + return; + } + + /* + * If we get here, then bfqq is being activated shortly after the + * last queue. So, if the current burst is also large, we can mark + * bfqq as belonging to this large burst immediately. + */ + if (bfqd->large_burst) { + bfq_mark_bfqq_in_large_burst(bfqq); + return; + } + + /* + * If we get here, then a large-burst state has not yet been + * reached, but bfqq is being activated shortly after the last + * queue. Then we add bfqq to the burst. + */ + bfq_add_to_burst(bfqd, bfqq); +} + +static void bfq_add_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_entity *entity = &bfqq->entity; + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; + unsigned long old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; + + bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + elv_rb_add(&bfqq->sort_list, rq); + + /* + * Check if this request is a better next-serve candidate. 
+ */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + BUG_ON(next_rq == NULL); + bfqq->next_rq = next_rq; + + /* + * Adjust priority tree position, if next_rq changes. + */ + if (prev != bfqq->next_rq) + bfq_rq_pos_tree_add(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) { + bool soft_rt, coop_or_in_burst, + idle_for_long_time = time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); + + if (bfq_bfqq_sync(bfqq)) { + bool already_in_burst = + !hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq); + bfq_handle_burst(bfqd, bfqq, idle_for_long_time); + /* + * If bfqq was not already in the current burst, + * then, at this point, bfqq either has been + * added to the current burst or has caused the + * current burst to terminate. In particular, in + * the second case, bfqq has become the first + * queue in a possible new burst. + * In both cases last_ins_in_burst needs to be + * moved forward. + */ + if (!already_in_burst) + bfqd->last_ins_in_burst = jiffies; + } + + coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || + bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !coop_or_in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); + interactive = !coop_or_in_burst && idle_for_long_time; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (time_before(jiffies, + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle)) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + } + + if (!bfqd->low_latency) + goto add_bfqq_busy; + + if (bfq_bfqq_just_split(bfqq)) + goto set_ioprio_changed; + + /* + * If the queue: + * - is not being boosted, + * - has been idle for enough time, + * - is not a sync queue or is linked to a bfq_io_cq (it is + * shared "for its nature" or it is not shared and its + * requests have not been redirected to a shared queue) + * start a weight-raising period. + */ + if (old_wr_coeff == 1 && (interactive || soft_rt) && + (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } else if (old_wr_coeff > 1) { + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else if (coop_or_in_burst || + (bfqq->wr_cur_max_time == + bfqd->bfq_wr_rt_max_time && + !soft_rt)) { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq-> + wr_cur_max_time)); + } else if (time_before( + bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time, + jiffies + + bfqd->bfq_wr_rt_max_time) && + soft_rt) { + /* + * + * The remaining weight-raising time is lower + * than bfqd->bfq_wr_rt_max_time, which means + * that the application is enjoying weight + * raising either because deemed soft-rt in + * the near past, or because deemed interactive + * a long ago. 
+ * In both cases, resetting now the current + * remaining weight-raising time for the + * application to the weight-raising duration + * for soft rt applications would not cause any + * latency increase for the application (as the + * new duration would be higher than the + * remaining time). + * + * In addition, the application is now meeting + * the requirements for being deemed soft rt. + * In the end we can correctly and safely + * (re)charge the weight-raising duration for + * the application with the weight-raising + * duration for soft rt applications. + * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. + */ + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + } +set_ioprio_changed: + if (old_wr_coeff != bfqq->wr_coeff) + entity->ioprio_changed = 1; +add_bfqq_busy: + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + bfq_add_bfqq_busy(bfqd, bfqq); + } else { + if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && + time_is_before_jiffies( + bfqq->last_wr_start_finish + + bfqd->bfq_wr_min_inter_arr_async)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + + bfqd->wr_busy_queues++; + entity->ioprio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "non-idle wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); + } + + if (bfqd->low_latency && + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) + bfqq->last_wr_start_finish = jiffies; +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio) +{ + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (bic == NULL) + return NULL; + + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + if (bfqq != NULL) + return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); + + return NULL; +} + +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + bfqd->rq_in_driver++; + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", + (long long unsigned)bfqd->last_position); +} + +static inline void bfq_deactivate_request(struct request_queue *q, + struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + BUG_ON(bfqd->rq_in_driver == 0); + bfqd->rq_in_driver--; +} + +static void bfq_remove_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync 
= rq_is_sync(rq); + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); + } + + if (rq->queuelist.prev != &rq->queuelist) + list_del_init(&rq->queuelist); + BUG_ON(bfqq->queued[sync] == 0); + bfqq->queued[sync]--; + bfqd->queued--; + elv_rb_del(&bfqq->sort_list, rq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) + bfq_del_bfqq_busy(bfqd, bfqq, 1); + /* + * Remove queue from request-position tree as it is empty. + */ + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + } + + if (rq->cmd_flags & REQ_META) { + BUG_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } +} + +static int bfq_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + + __rq = bfq_find_rq_fmerge(bfqd, bio); + if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; + } + + return ELEVATOR_NO_MERGE; +} + +static void bfq_merged_request(struct request_queue *q, struct request *req, + int type) +{ + if (type == ELEVATOR_FRONT_MERGE && + rb_prev(&req->rb_node) && + blk_rq_pos(req) < + blk_rq_pos(container_of(rb_prev(&req->rb_node), + struct request, rb_node))) { + struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *prev, *next_rq; + + /* Reposition request in its sort_list */ + elv_rb_del(&bfqq->sort_list, req); + elv_rb_add(&bfqq->sort_list, req); + /* Choose next request to be served for bfqq */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, + bfqd->last_position); + BUG_ON(next_rq == NULL); + bfqq->next_rq = next_rq; + /* + * If next_rq changes, update both the queue's budget to + * fit the new request and the queue's position in its + * rq_pos_tree. + */ + if (prev != bfqq->next_rq) { + bfq_updated_next_req(bfqd, bfqq); + bfq_rq_pos_tree_add(bfqd, bfqq); + } + } +} + +static void bfq_merged_requests(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + + /* + * If next and rq belong to the same bfq_queue and next is older + * than rq, then reposition rq in the fifo (by substituting next + * with rq). Otherwise, if next and rq belong to different + * bfq_queues, never reposition rq: in fact, we would have to + * reposition it with respect to next's position in its own fifo, + * which would most certainly be too expensive with respect to + * the benefits. 
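+ *
+ * A rough illustration (timings invented for the example): if
+ * rq->fifo_time expires at jiffies + 100 while next->fifo_time
+ * expires at jiffies + 40, rq takes next's position in the fifo
+ * and inherits the earlier expiry, jiffies + 40, so the surviving
+ * request is not served later than either original request would
+ * have been.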
+ */ + if (bfqq == next_bfqq && + !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + time_before(next->fifo_time, rq->fifo_time)) { + list_del_init(&rq->queuelist); + list_replace_init(&next->queuelist, &rq->queuelist); + rq->fifo_time = next->fifo_time; + } + + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + + bfq_remove_request(next); +} + +/* Must be called with bfqq != NULL */ +static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +{ + BUG_ON(bfqq == NULL); + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues--; + bfqq->wr_coeff = 1; + bfqq->wr_cur_max_time = 0; + /* Trigger a weight change on the next activation of the queue */ + bfqq->entity.ioprio_changed = 1; +} + +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + if (bfqg->async_bfqq[i][j] != NULL) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); + if (bfqg->async_idle_bfqq != NULL) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq); +} + +static void bfq_end_wr(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + spin_lock_irq(bfqd->queue->queue_lock); + + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); + + spin_unlock_irq(bfqd->queue->queue_lock); +} + +static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) +{ + if (request) + return blk_rq_pos(io_struct); + else + return ((struct bio *)io_struct)->bi_iter.bi_sector; +} + +static inline sector_t bfq_dist_from(sector_t pos1, + sector_t pos2) +{ + if (pos1 >= pos2) + return pos1 - pos2; + else + return pos2 - pos1; +} + +static inline int bfq_rq_close_to_sector(void *io_struct, bool request, + sector_t sector) +{ + return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= + BFQQ_SEEK_THR; +} + +static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) +{ + struct rb_root *root = &bfqd->rq_pos_tree; + struct rb_node *parent, *node; + struct bfq_queue *__bfqq; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); + if (__bfqq != NULL) + return __bfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector (rq_pos_tree sorted by + * next_request position). + */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + if (blk_rq_pos(__bfqq->next_rq) < sector) + node = rb_next(&__bfqq->pos_node); + else + node = rb_prev(&__bfqq->pos_node); + if (node == NULL) + return NULL; + + __bfqq = rb_entry(node, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + return NULL; +} + +/* + * bfqd - obvious + * cur_bfqq - passed in so that we don't decide that the current queue + * is closely cooperating with itself + * sector - used as a reference point to search for a close queue + */ +static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) +{ + struct bfq_queue *bfqq; + + if (bfq_class_idle(cur_bfqq)) + return NULL; + if (!bfq_bfqq_sync(cur_bfqq)) + return NULL; + if (BFQQ_SEEKY(cur_bfqq)) + return NULL; + + /* If device has only one backlogged bfq_queue, don't search. 
*/ + if (bfqd->busy_queues == 1) + return NULL; + + /* + * We should notice if some of the queues are cooperating, e.g. + * working closely on the same area of the disk. In that case, + * we can group them together and don't waste time idling. + */ + bfqq = bfqq_close(bfqd, sector); + if (bfqq == NULL || bfqq == cur_bfqq) + return NULL; + + /* + * Do not merge queues from different bfq_groups. + */ + if (bfqq->entity.parent != cur_bfqq->entity.parent) + return NULL; + + /* + * It only makes sense to merge sync queues. + */ + if (!bfq_bfqq_sync(bfqq)) + return NULL; + if (BFQQ_SEEKY(bfqq)) + return NULL; + + /* + * Do not merge queues of different priority classes. + */ + if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) + return NULL; + + return bfqq; +} + +static struct bfq_queue * +bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + int process_refs, new_process_refs; + struct bfq_queue *__bfqq; + + /* + * If there are no process references on the new_bfqq, then it is + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain + * may have dropped their last reference (not just their last process + * reference). + */ + if (!bfqq_process_refs(new_bfqq)) + return NULL; + + /* Avoid a circular list and skip interim queue merges. */ + while ((__bfqq = new_bfqq->new_bfqq)) { + if (__bfqq == bfqq) + return NULL; + new_bfqq = __bfqq; + } + + process_refs = bfqq_process_refs(bfqq); + new_process_refs = bfqq_process_refs(new_bfqq); + /* + * If the process for the bfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", + new_bfqq->pid); + + /* + * Merging is just a redirection: the requests of the process + * owning one of the two queues are redirected to the other queue. + * The latter queue, in its turn, is set as shared if this is the + * first time that the requests of some process are redirected to + * it. + * + * We redirect bfqq to new_bfqq and not the opposite, because we + * are in the context of the process owning bfqq, hence we have + * the io_cq of this process. So we can immediately configure this + * io_cq to redirect the requests of the process to new_bfqq. + * + * NOTE, even if new_bfqq coincides with the in-service queue, the + * io_cq of new_bfqq is not available, because, if the in-service + * queue is shared, bfqd->in_service_bic may not point to the + * io_cq of the in-service queue. + * Redirecting the requests of the process owning bfqq to the + * currently in-service queue is in any case the best option, as + * we feed the in-service queue with new requests close to the + * last request served and, by doing so, hopefully increase the + * throughput. + */ + bfqq->new_bfqq = new_bfqq; + atomic_add(process_refs, &new_bfqq->ref); + return new_bfqq; +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service queue + * or with a close queue among the scheduled queues. + * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue + * structure otherwise. + * + * The OOM queue is not allowed to participate to cooperation: in fact, since + * the requests temporarily redirected to the OOM queue could be redirected + * again to dedicated queues at any time, the state needed to correctly + * handle merging with the OOM queue would be quite complex and expensive + * to maintain. 
Besides, in such a critical condition as an out of memory, + * the benefits of queue merging may be little relevant, or even negligible. + */ +static struct bfq_queue * +bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request) +{ + struct bfq_queue *in_service_bfqq, *new_bfqq; + + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + + in_service_bfqq = bfqd->in_service_queue; + + if (in_service_bfqq == NULL || in_service_bfqq == bfqq || + !bfqd->in_service_bic || + unlikely(in_service_bfqq == &bfqd->oom_bfqq)) + goto check_scheduled; + + if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq)) + goto check_scheduled; + + if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq)) + goto check_scheduled; + + if (in_service_bfqq->entity.parent != bfqq->entity.parent) + goto check_scheduled; + + if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { + new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); + if (new_bfqq != NULL) + return new_bfqq; /* Merge with in-service queue */ + } + + /* + * Check whether there is a cooperator among currently scheduled + * queues. The only thing we need is that the bio/request is not + * NULL, as we need it to establish whether a cooperator exists. + */ +check_scheduled: + new_bfqq = bfq_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + + return NULL; +} + +static inline void +bfq_bfqq_save_state(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic == NULL, the queue is already shared or its requests + * have already been redirected to a shared queue; both idle window + * and weight raising state have already been saved. Do nothing. + */ + if (bfqq->bic == NULL) + return; + if (bfqq->bic->wr_time_left) + /* + * This is the queue of a just-started process, and would + * deserve weight raising: we set wr_time_left to the full + * weight-raising duration to trigger weight-raising when + * and if the queue is split and the first request of the + * queue is enqueued. + */ + bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); + else if (bfqq->wr_coeff > 1) { + unsigned long wr_duration = + jiffies - bfqq->last_wr_start_finish; + /* + * It may happen that a queue's weight raising period lasts + * longer than its wr_cur_max_time, as weight raising is + * handled only when a request is enqueued or dispatched (it + * does not use any timer). If the weight raising period is + * about to end, don't save it. + */ + if (bfqq->wr_cur_max_time <= wr_duration) + bfqq->bic->wr_time_left = 0; + else + bfqq->bic->wr_time_left = + bfqq->wr_cur_max_time - wr_duration; + /* + * The bfq_queue is becoming shared or the requests of the + * process owning the queue are being redirected to a shared + * queue. Stop the weight raising period of the queue, as in + * both cases it should not be owned by an interactive or + * soft real-time application. 
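+ *
+ * As a purely illustrative example: with a weight-raising
+ * period (wr_cur_max_time) of 300 ms of which 120 ms have
+ * already elapsed since last_wr_start_finish, wr_time_left
+ * above is set to the remaining 180 ms, so that a later queue
+ * split can resume the raising period where it left off
+ * instead of restarting it from scratch.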
+ */ + bfq_bfqq_end_wr(bfqq); + } else + bfqq->bic->wr_time_left = 0; + bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bfqq->bic->cooperations++; + bfqq->bic->failed_cooperations = 0; +} + +static inline void +bfq_get_bic_reference(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic has a non-NULL value, the bic to which it belongs + * is about to begin using a shared bfq_queue. + */ + if (bfqq->bic) + atomic_long_inc(&bfqq->bic->icq.ioc->refcount); +} + +static void +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (long unsigned)new_bfqq->pid); + /* Save weight raising and idle window of the merged queues */ + bfq_bfqq_save_state(bfqq); + bfq_bfqq_save_state(new_bfqq); + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); + /* + * Grab a reference to the bic, to prevent it from being destroyed + * before being possibly touched by a bfq_split_bfqq(). + */ + bfq_get_bic_reference(bfqq); + bfq_get_bic_reference(new_bfqq); + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ + bic_set_bfqq(bic, new_bfqq, 1); + bfq_mark_bfqq_coop(new_bfqq); + /* + * new_bfqq now belongs to at least two bics (it is a shared queue): + * set new_bfqq->bic to NULL. bfqq either: + * - does not belong to any bic any more, and hence bfqq->bic must + * be set to NULL, or + * - is a queue whose owning bics have already been redirected to a + * different queue, hence the queue is destined to not belong to + * any bic soon and bfqq->bic is already NULL (therefore the next + * assignment causes no harm). + */ + new_bfqq->bic = NULL; + bfqq->bic = NULL; + bfq_put_queue(bfqq); +} + +static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +{ + struct bfq_io_cq *bic = bfqq->bic; + struct bfq_data *bfqd = bfqq->bfqd; + + if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { + bic->failed_cooperations++; + if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) + bic->cooperations = 0; + } +} + +static int bfq_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq, *new_bfqq; + + /* + * Disallow merge of a sync bio into an async request. + */ + if (bfq_bio_sync(bio) && !rq_is_sync(rq)) + return 0; + + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. + * Queue lock is held here. + */ + bic = bfq_bic_lookup(bfqd, current->io_context); + if (bic == NULL) + return 0; + + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + /* + * We take advantage of this function to perform an early merge + * of the queues of possible cooperating processes. + */ + if (bfqq != NULL) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + if (new_bfqq != NULL) { + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + /* + * If we get here, the bio will be queued in the + * shared queue, i.e., new_bfqq, so use new_bfqq + * to decide whether bio and rq can be merged. 
+ */ + bfqq = new_bfqq; + } else + bfq_bfqq_increase_failed_cooperations(bfqq); + } + + return bfqq == RQ_BFQQ(rq); +} + +static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (bfqq != NULL) { + bfq_mark_bfqq_must_alloc(bfqq); + bfq_mark_bfqq_budget_new(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + + bfq_log_bfqq(bfqd, bfqq, + "set_in_service_queue, cur-budget = %lu", + bfqq->entity.budget); + } + + bfqd->in_service_queue = bfqq; +} + +/* + * Get and set a new queue for service. + */ +static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); + + __bfq_set_in_service_queue(bfqd, bfqq); + return bfqq; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + struct bfq_io_cq *bic; + unsigned long sl; + + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Processes have exited, don't wait. */ + bic = bfqd->in_service_bic; + if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) + return; + + bfq_mark_bfqq_wait_request(bfqq); + + /* + * We don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. So allow a little bit of time for him to submit a new rq. + * + * To prevent processes with (partly) seeky workloads from + * being too ill-treated, grant them a small fraction of the + * assigned budget before reducing the waiting time to + * BFQ_MIN_TT. This happened to help reduce latency. + */ + sl = bfqd->bfq_slice_idle; + /* + * Unless the queue is being weight-raised or the scenario is + * asymmetric, grant only minimum idle time if the queue either + * has been seeky for long enough or has already proved to be + * constantly seeky. + */ + if (bfq_sample_valid(bfqq->seek_samples) && + ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > + bfq_max_budget(bfqq->bfqd) / 8) || + bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && + symmetric_scenario) + sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); + else if (bfqq->wr_coeff > 1) + sl = sl * 3; + bfqd->last_idling_start = ktime_get(); + mod_timer(&bfqd->idle_slice_timer, jiffies + sl); + bfq_log(bfqd, "arm idle: %u/%u ms", + jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); +} + +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the disk + * throughput (always guaranteed with a time slice scheme as in CFQ). 
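+ *
+ * The allowed time scales with the weight-raising factor: as a
+ * purely illustrative example, a queue whose weight is currently
+ * raised by a coefficient of 20 gets
+ * timeout_coeff = entity.weight / entity.orig_weight = 20 below,
+ * and hence twenty times the base budget timeout; soft real-time
+ * weight raising is the exception and keeps the coefficient at 1.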
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq = bfqd->in_service_queue;
+ unsigned int timeout_coeff;
+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
+ timeout_coeff = 1;
+ else
+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+
+ bfqd->last_budget_start = ktime_get();
+
+ bfq_clear_bfqq_budget_new(bfqq);
+ bfqq->budget_timeout = jiffies +
+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
+
+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
+ timeout_coeff));
+}
+
+/*
+ * Move request from internal lists to the request queue dispatch list.
+ */
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+ /*
+ * For consistency, the next instruction should have been executed
+ * after removing the request from the queue and dispatching it.
+ * We execute instead this instruction before bfq_remove_request()
+ * (and hence introduce a temporary inconsistency), for efficiency.
+ * In fact, in a forced_dispatch, this prevents two counters related
+ * to bfqq->dispatched from being uselessly decremented if bfqq
+ * is not in service, and then incremented again after
+ * bfqq->dispatched itself has been incremented.
+ */
+ bfqq->dispatched++;
+ bfq_remove_request(rq);
+ elv_dispatch_sort(q, rq);
+
+ if (bfq_bfqq_sync(bfqq))
+ bfqd->sync_flight++;
+}
+
+/*
+ * Return expired entry, or NULL to just start from scratch in rbtree.
+ */
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
+{
+ struct request *rq = NULL;
+
+ if (bfq_bfqq_fifo_expire(bfqq))
+ return NULL;
+
+ bfq_mark_bfqq_fifo_expire(bfqq);
+
+ if (list_empty(&bfqq->fifo))
+ return NULL;
+
+ rq = rq_entry_fifo(bfqq->fifo.next);
+
+ if (time_before(jiffies, rq->fifo_time))
+ return NULL;
+
+ return rq;
+}
+
+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+ return entity->budget - entity->service;
+}
+
+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ BUG_ON(bfqq != bfqd->in_service_queue);
+
+ __bfq_bfqd_reset_in_service(bfqd);
+
+ /*
+ * If this bfqq is shared between multiple processes, check
+ * to make sure that those processes are still issuing I/Os
+ * within the mean seek distance. If not, it may be time to
+ * break the queues apart again.
+ */
+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
+ bfq_mark_bfqq_split_coop(bfqq);
+
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ /*
+ * Overloading budget_timeout field to store the time
+ * at which the queue became empty; used by
+ * the weight-raising mechanism.
+ */
+ bfqq->budget_timeout = jiffies;
+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
+ } else {
+ bfq_activate_bfqq(bfqd, bfqq);
+ /*
+ * Resort priority tree of potential close cooperators.
+ */
+ bfq_rq_pos_tree_add(bfqd, bfqq);
+ }
+}
+
+/**
+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
+ * @bfqd: device data.
+ * @bfqq: queue to update.
+ * @reason: reason for expiration.
+ *
+ * Handle the feedback on @bfqq budget. See the body for detailed
+ * comments.
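+ *
+ * In short, the feedback implemented by the body is the following
+ * (all increases are capped at bfqd->bfq_max_budget, and the
+ * shrinking case never drops below min_budget):
+ * - TOO_IDLE, no outstanding requests: shrink by 4 * min_budget;
+ * - TOO_IDLE, requests still outstanding: double the budget;
+ * - BUDGET_TIMEOUT: double the budget;
+ * - BUDGET_EXHAUSTED: quadruple the budget;
+ * - NO_MORE_REQUESTS: leave the budget unchanged.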
+ */ +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + struct request *next_rq; + unsigned long budget, min_budget; + + budget = bfqq->max_budget; + min_budget = bfq_min_budget(bfqd); + + BUG_ON(bfqq != bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq)) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency + * for throughput. + */ + case BFQ_BFQQ_TOO_IDLE: + /* + * This is the only case where we may reduce + * the budget: if there is no request of the + * process still waiting for completion, then + * we assume (tentatively) that the timer has + * expired because the batch of requests of + * the process could have been served with a + * smaller budget. Hence, betting that + * process will behave in the same way when it + * becomes backlogged again, we reduce its + * next budget. As long as we guess right, + * this budget cut reduces the latency + * experienced by the process. + * + * However, if there are still outstanding + * requests, then the process may have not yet + * issued its next request just because it is + * still waiting for the completion of some of + * the still outstanding ones. So in this + * subcase we do not reduce its budget, on the + * contrary we increase it to possibly boost + * the throughput, as discussed in the + * comments to the BUDGET_TIMEOUT case. + */ + if (bfqq->dispatched > 0) /* still outstanding reqs */ + budget = min(budget * 2, bfqd->bfq_max_budget); + else { + if (budget > 5 * min_budget) + budget -= 4 * min_budget; + else + budget = min_budget; + } + break; + case BFQ_BFQQ_BUDGET_TIMEOUT: + /* + * We double the budget here because: 1) it + * gives the chance to boost the throughput if + * this is not a seeky process (which may have + * bumped into this timeout because of, e.g., + * ZBR), 2) together with charge_full_budget + * it helps give seeky processes higher + * timestamps, and hence be served less + * frequently. + */ + budget = min(budget * 2, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_BUDGET_EXHAUSTED: + /* + * The process still has backlog, and did not + * let either the budget timeout or the disk + * idling timeout expire. Hence it is not + * seeky, has a short thinktime and may be + * happy with a higher budget too. So + * definitely increase the budget of this good + * candidate to boost the disk throughput. + */ + budget = min(budget * 4, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_NO_MORE_REQUESTS: + /* + * Leave the budget unchanged. + */ + default: + return; + } + } else /* async queue */ + /* async queues get always the maximum possible budget + * (their ability to dispatch is limited by + * @bfqd->bfq_max_budget_async_rq). + */ + budget = bfqd->bfq_max_budget; + + bfqq->max_budget = budget; + + if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && + bfqq->max_budget > bfqd->bfq_max_budget) + bfqq->max_budget = bfqd->bfq_max_budget; + + /* + * Make sure that we have enough budget for the next request. + * Since the finish time of the bfqq must be kept in sync with + * the budget, be sure to call __bfq_bfqq_expire() after the + * update. 
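+ *
+ * For instance (figures purely illustrative): with max_budget at
+ * 8192 sectors but a next request whose service would be charged
+ * as 12288 sectors, entity.budget is raised to 12288 below, so
+ * that the queue's finish timestamp already accounts for that
+ * request.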
+ */
+ next_rq = bfqq->next_rq;
+ if (next_rq != NULL)
+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
+ bfq_serv_to_charge(next_rq, bfqq));
+ else
+ bfqq->entity.budget = bfqq->max_budget;
+
+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
+ bfqq->entity.budget);
+}
+
+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
+{
+ unsigned long max_budget;
+
+ /*
+ * The max_budget calculated when autotuning is equal to the
+ * number of sectors transferred in timeout_sync at the
+ * estimated peak rate.
+ */
+ max_budget = (unsigned long)(peak_rate * 1000 *
+ timeout >> BFQ_RATE_SHIFT);
+
+ return max_budget;
+}
+
+/*
+ * In addition to updating the peak rate, checks whether the process
+ * is "slow", and returns 1 if so. This slow flag is used, in addition
+ * to the budget timeout, to reduce the amount of service provided to
+ * seeky processes, and hence reduce their chances of lowering the
+ * throughput. See the code for more details.
+ */
+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ int compensate, enum bfqq_expiration reason)
+{
+ u64 bw, usecs, expected, timeout;
+ ktime_t delta;
+ int update = 0;
+
+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
+ return 0;
+
+ if (compensate)
+ delta = bfqd->last_idling_start;
+ else
+ delta = ktime_get();
+ delta = ktime_sub(delta, bfqd->last_budget_start);
+ usecs = ktime_to_us(delta);
+
+ /* Don't trust short/unrealistic values. */
+ if (usecs < 100 || usecs >= LONG_MAX)
+ return 0;
+
+ /*
+ * Calculate the bandwidth for the last slice. We use a 64 bit
+ * value to store the peak rate, in sectors per usec in fixed
+ * point math. We do so to have enough precision in the estimate
+ * and to avoid overflows.
+ */
+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
+ do_div(bw, (unsigned long)usecs);
+
+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
+
+ /*
+ * Use only long (> 20ms) intervals to filter out spikes for
+ * the peak rate estimation.
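+ *
+ * To make the fixed-point math above concrete (figures invented,
+ * and assuming BFQ_RATE_SHIFT is 16): a slice that served 2048
+ * sectors in 10000 usecs yields
+ * bw = (2048 << 16) / 10000 = 13421, i.e., about 0.2 sectors/usec
+ * in fixed-point representation.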
+ */
+ if (usecs > 20000) {
+ if (bw > bfqd->peak_rate ||
+ (!BFQQ_SEEKY(bfqq) &&
+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
+ bfq_log(bfqd, "measured bw =%llu", bw);
+ /*
+ * To smooth oscillations use a low-pass filter with
+ * alpha=7/8, i.e.,
+ * new_rate = (7/8) * old_rate + (1/8) * bw
+ */
+ do_div(bw, 8);
+ if (bw == 0)
+ return 0;
+ bfqd->peak_rate *= 7;
+ do_div(bfqd->peak_rate, 8);
+ bfqd->peak_rate += bw;
+ update = 1;
+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
+ }
+
+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
+
+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
+ bfqd->peak_rate_samples++;
+
+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
+ update) {
+ int dev_type = blk_queue_nonrot(bfqd->queue);
+ if (bfqd->bfq_user_max_budget == 0) {
+ bfqd->bfq_max_budget =
+ bfq_calc_max_budget(bfqd->peak_rate,
+ timeout);
+ bfq_log(bfqd, "new max_budget=%lu",
+ bfqd->bfq_max_budget);
+ }
+ if (bfqd->device_speed == BFQ_BFQD_FAST &&
+ bfqd->peak_rate < device_speed_thresh[dev_type]) {
+ bfqd->device_speed = BFQ_BFQD_SLOW;
+ bfqd->RT_prod = R_slow[dev_type] *
+ T_slow[dev_type];
+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+ bfqd->peak_rate > device_speed_thresh[dev_type]) {
+ bfqd->device_speed = BFQ_BFQD_FAST;
+ bfqd->RT_prod = R_fast[dev_type] *
+ T_fast[dev_type];
+ }
+ }
+ }
+
+ /*
+ * If the process has been served for too short a time
+ * interval to let its possible sequential accesses prevail over
+ * the initial seek time needed to move the disk head to the
+ * first sector it requested, then give the process a chance
+ * and for the moment return false.
+ */
+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
+ return 0;
+
+ /*
+ * A process is considered ``slow'' (i.e., seeky, so that we
+ * cannot treat it fairly in the service domain, as it would
+ * slow down the other processes too much) if, when a slice
+ * ends for whatever reason, it has received service at a
+ * rate that would not be high enough to complete the budget
+ * before the budget timeout expiration.
+ */
+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
+
+ /*
+ * Caveat: processes doing IO in the slower disk zones will
+ * tend to be slow(er) even if not seeky. And the estimated
+ * peak rate will actually be an average over the disk
+ * surface. Hence, so as not to be too harsh with unlucky
+ * processes, we keep a budget/3 margin of safety before
+ * declaring a process slow.
+ */
+ return expected > (4 * bfqq->entity.budget) / 3;
+ * For this reason the next function is invoked to compute
+ * soft_rt_next_start only for applications that meet this requirement,
+ * whereas soft_rt_next_start is set to infinity for applications that do
+ * not.
+ *
+ * Unfortunately, even a greedy application may happen to behave in an
+ * isochronous way if the CPU load is high. In fact, the application may
+ * stop issuing requests while the CPUs are busy serving other processes,
+ * then restart, then stop again for a while, and so on. In addition, if
+ * the disk achieves a low enough throughput with the request pattern
+ * issued by the application (e.g., because the request pattern is random
+ * and/or the device is slow), then the application may meet the above
+ * bandwidth requirement too. To prevent such a greedy application from
+ * being deemed soft real-time, a further rule is used in the computation
+ * of soft_rt_next_start: soft_rt_next_start must be higher than the
+ * current time plus the maximum time for which the arrival of a request
+ * is waited for when a sync queue becomes idle, namely
+ * bfqd->bfq_slice_idle. This filters out greedy applications, as the
+ * latter instead issue their next request as soon as possible after the
+ * last one has been completed (in contrast, when a batch of requests is
+ * completed, a soft real-time application spends some time processing
+ * data).
+ *
+ * Unfortunately, the last filter may easily generate false positives if
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
+ * or both of the following cases occur:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
+ * HZ=100.
+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
+ * for a while, then suddenly 'jump' by several units to recover the lost
+ * increments. This seems to happen, e.g., inside virtual machines.
+ * To address this issue, we do not use as a reference time interval just
+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
+ * particular we add the minimum number of jiffies for which the filter
+ * seems to be quite precise even in embedded systems and KVM/QEMU virtual
+ * machines.
+ */
+static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ return max(bfqq->last_idle_bklogged +
+ HZ * bfqq->service_from_backlogged /
+ bfqd->bfq_wr_max_softrt_rate,
+ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
+}
+
+/*
+ * Return the largest-possible time instant such that, for as long as possible,
+ * the current time will be lower than this time instant according to the macro
+ * time_is_before_jiffies().
+ */
+static inline unsigned long bfq_infinity_from_now(unsigned long now)
+{
+ return now + ULONG_MAX / 2;
+}
+
+/**
+ * bfq_bfqq_expire - expire a queue.
+ * @bfqd: device owning the queue.
+ * @bfqq: the queue to expire.
+ * @compensate: if true, compensate for the time spent idling.
+ * @reason: the reason causing the expiration.
+ *
+ * If the process associated with the queue is slow (i.e., seeky), or in
+ * case of budget timeout, or, finally, if it is async, we
+ * artificially charge it an entire budget (independently of the
+ * actual service it received). As a consequence, the queue will get
+ * higher timestamps than the correct ones upon reactivation, and
+ * hence it will be rescheduled as if it had received more service
+ * than what it actually received. In the end, this class of processes
+ * will receive less service in proportion to how slowly they consume
+ * their budgets (and hence how seriously they tend to lower the
+ * throughput).
+ *
+ * In contrast, when a queue expires because it has been idling for
+ * too long or because it exhausted its budget, we do not touch the
+ * amount of service it has received. Hence, when the queue is
+ * reactivated and its timestamps updated, the latter will be in sync
+ * with the actual service received by the queue until expiration.
+ *
+ * Charging a full budget to the first type of queues and the exact
+ * service to the others has the effect of using the WF2Q+ policy to
+ * schedule the former on a timeslice basis, without violating the
+ * service domain guarantees of the latter.
+ */
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ int compensate,
+ enum bfqq_expiration reason)
+{
+ int slow;
+ BUG_ON(bfqq != bfqd->in_service_queue);
+
+ /* Update disk peak rate for autotuning and check whether the
+ * process is slow (see bfq_update_peak_rate).
+ */
+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
+
+ /*
+ * As explained above, 'punish' slow (i.e., seeky), timed-out
+ * and async queues, to favor sequential sync workloads.
+ *
+ * Processes doing I/O in the slower disk zones will tend to be
+ * slow(er) even if not seeky. Hence, since the estimated peak
+ * rate is actually an average over the disk surface, these
+ * processes may timeout just for bad luck. To avoid punishing
+ * them we do not charge a full budget to a process that
+ * succeeded in consuming at least 2/3 of its budget.
+ */
+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
+ bfq_bfqq_charge_full_budget(bfqq);
+
+ bfqq->service_from_backlogged += bfqq->entity.service;
+
+ if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+ !bfq_bfqq_constantly_seeky(bfqq)) {
+ bfq_mark_bfqq_constantly_seeky(bfqq);
+ if (!blk_queue_nonrot(bfqd->queue))
+ bfqd->const_seeky_busy_in_flight_queues++;
+ }
+
+ if (reason == BFQ_BFQQ_TOO_IDLE &&
+ bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
+ bfq_clear_bfqq_IO_bound(bfqq);
+
+ if (bfqd->low_latency && bfqq->wr_coeff == 1)
+ bfqq->last_wr_start_finish = jiffies;
+
+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
+ RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ /*
+ * If we get here, and there are no outstanding requests,
+ * then the request pattern is isochronous (see the comments
+ * to the function bfq_bfqq_softrt_next_start()). Hence we
+ * can compute soft_rt_next_start. If, instead, the queue
+ * still has outstanding requests, then we have to wait
+ * for the completion of all the outstanding requests to
+ * discover whether the request pattern is actually
+ * isochronous.
+ */
+ if (bfqq->dispatched == 0)
+ bfqq->soft_rt_next_start =
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
+ else {
+ /*
+ * The application is still waiting for the
+ * completion of one or more requests:
+ * prevent it from possibly being incorrectly
+ * deemed soft real-time by setting its
+ * soft_rt_next_start to infinity. In fact,
+ * without this assignment, the application
+ * would be incorrectly deemed soft
+ * real-time if:
+ * 1) it issued a new request before the
+ * completion of all its in-flight
+ * requests, and
+ * 2) at that time, its soft_rt_next_start
+ * happened to be in the past.
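+ *
+ * As a worked example of the computation performed by
+ * bfq_bfqq_softrt_next_start() (figures purely illustrative):
+ * with HZ = 1000, service_from_backlogged = 700 sectors and
+ * bfq_wr_max_softrt_rate = 7000 sectors/sec, the first operand
+ * of the max() is last_idle_bklogged + 100 jiffies; the second
+ * operand, jiffies + bfq_slice_idle + 4, wins whenever it is
+ * larger, implementing the greedy-application filter described
+ * there.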
+ */ + bfqq->soft_rt_next_start = + bfq_infinity_from_now(jiffies); + /* + * Schedule an update of soft_rt_next_start to when + * the task may be discovered to be isochronous. + */ + bfq_mark_bfqq_softrt_update(bfqq); + } + } + + bfq_log_bfqq(bfqd, bfqq, + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, + slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + + /* + * Increase, decrease or leave budget unchanged according to + * reason. + */ + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + __bfq_bfqq_expire(bfqd, bfqq); +} + +/* + * Budget timeout is not implemented through a dedicated timer, but + * just checked on request arrivals and completions, as well as on + * idle timer expirations. + */ +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_budget_new(bfqq) || + time_before(jiffies, bfqq->budget_timeout)) + return 0; + return 1; +} + +/* + * If we expire a queue that is waiting for the arrival of a new + * request, we may prevent the fictitious timestamp back-shifting that + * allows the guarantees of the queue to be preserved (see [1] for + * this tricky aspect). Hence we return true only if this condition + * does not hold, or if the queue is slow enough to deserve only to be + * kicked off for preserving a high throughput. +*/ +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, + "may_budget_timeout: wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); + + return (!bfq_bfqq_wait_request(bfqq) || + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) + && + bfq_bfqq_budget_timeout(bfqq); +} + +/* + * Device idling is allowed only for the queues for which this function + * returns true. For this reason, the return value of this function plays a + * critical role for both throughput boosting and service guarantees. The + * return value is computed through a logical expression. In this rather + * long comment, we try to briefly describe all the details and motivations + * behind the components of this logical expression. + * + * First, the expression is false if bfqq is not sync, or if: bfqq happened + * to become active during a large burst of queue activations, and the + * pattern of requests bfqq contains boosts the throughput if bfqq is + * expired. In fact, queues that became active during a large burst benefit + * only from throughput, as discussed in the comments to bfq_handle_burst. + * In this respect, expiring bfqq certainly boosts the throughput on NCQ- + * capable flash-based devices, whereas, on rotational devices, it boosts + * the throughput only if bfqq contains random requests. + * + * On the opposite end, if (a) bfqq is sync, (b) the above burst-related + * condition does not hold, and (c) bfqq is being weight-raised, then the + * expression always evaluates to true, as device idling is instrumental + * for preserving low-latency guarantees (see [1]). If, instead, conditions + * (a) and (b) do hold, but (c) does not, then the expression evaluates to + * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and + * (2) at least one of the following two conditions holds. + * The first condition is that the device is not performing NCQ, because + * idling the device most certainly boosts the throughput if this condition + * holds and bfqq is I/O-bound and has been granted a non-null idle window. 
+ * The second compound condition is made of the logical AND of two components. + * + * The first component is true only if there is no weight-raised busy + * queue. This guarantees that the device is not idled for a sync non- + * weight-raised queue when there are busy weight-raised queues. The former + * is then expired immediately if empty. Combined with the timestamping + * rules of BFQ (see [1] for details), this causes sync non-weight-raised + * queues to get a lower number of requests served, and hence to ask for a + * lower number of requests from the request pool, before the busy weight- + * raised queues get served again. + * + * This is beneficial for the processes associated with weight-raised + * queues, when the request pool is saturated (e.g., in the presence of + * write hogs). In fact, if the processes associated with the other queues + * ask for requests at a lower rate, then weight-raised processes have a + * higher probability to get a request from the pool immediately (or at + * least soon) when they need one. Hence they have a higher probability to + * actually get a fraction of the disk throughput proportional to their + * high weight. This is especially true with NCQ-capable drives, which + * enqueue several requests in advance and further reorder internally- + * queued requests. + * + * In the end, mistreating non-weight-raised queues when there are busy + * weight-raised queues seems to mitigate starvation problems in the + * presence of heavy write workloads and NCQ, and hence to guarantee a + * higher application and system responsiveness in these hostile scenarios. + * + * If the first component of the compound condition is instead true, i.e., + * there is no weight-raised busy queue, then the second component of the + * compound condition takes into account service-guarantee and throughput + * issues related to NCQ (recall that the compound condition is evaluated + * only if the device is detected as supporting NCQ). + * + * As for service guarantees, allowing the drive to enqueue more than one + * request at a time, and hence delegating de facto final scheduling + * decisions to the drive's internal scheduler, causes loss of control on + * the actual request service order. In this respect, when the drive is + * allowed to enqueue more than one request at a time, the service + * distribution enforced by the drive's internal scheduler is likely to + * coincide with the desired device-throughput distribution only in the + * following, perfectly symmetric, scenario: + * 1) all active queues have the same weight, + * 2) all active groups at the same level in the groups tree have the same + * weight, + * 3) all active groups at the same level in the groups tree have the same + * number of children. + * + * Even in such a scenario, sequential I/O may still receive a preferential + * treatment, but this is not likely to be a big issue with flash-based + * devices, because of their non-dramatic loss of throughput with random + * I/O. Things do differ with HDDs, for which additional care is taken, as + * explained after completing the discussion for flash-based devices. + * + * Unfortunately, keeping the necessary state for evaluating exactly the + * above symmetry conditions would be quite complex and time-consuming. 
+ * Therefore BFQ evaluates instead the following stronger sub-conditions,
+ * for which it is much easier to maintain the needed state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, hence no state needs
+ * to be maintained in this case.
+ *
+ * According to the above considerations, the second component of the
+ * compound condition evaluates to true if any of the above symmetry
+ * sub-conditions does not hold, or the device is not flash-based.
+ * Therefore, if the first component is also true, then idling is allowed
+ * for a sync queue. These are the only sub-conditions considered if the
+ * device is flash-based, as, for such a device, it is sensible to force
+ * idling only for service-guarantee issues. In fact, as for throughput,
+ * idling NCQ-capable flash-based devices would not boost the throughput
+ * even with sequential I/O; rather it would lower the throughput in
+ * proportion to how fast the device is. In the end, (only) if all three
+ * sub-conditions hold and the device is flash-based, the compound
+ * condition evaluates to false and therefore no idling is performed.
+ *
+ * As already said, things change with a rotational device, where idling
+ * boosts the throughput with sequential I/O (even with NCQ). Hence, for
+ * such a device the second component of the compound condition evaluates
+ * to true also if the following additional sub-condition does not hold:
+ * the queue is constantly seeky. Unfortunately, this different behavior
+ * with respect to flash-based devices causes an additional asymmetry: if
+ * some sync queues enjoy idling and some other sync queues do not, then
+ * the latter get a low share of the device throughput, simply because the
+ * former get many requests served after being set as in service, whereas
+ * the latter do not. As a consequence, to guarantee the desired throughput
+ * distribution, on HDDs the compound expression evaluates to true (and
+ * hence device idling is performed) also if the following last symmetry
+ * condition does not hold: no other queue is benefiting from idling.
+ * This last condition, too, is actually replaced with a
+ * simpler-to-maintain and stronger condition: there is no busy queue
+ * which is not constantly seeky (and hence may also benefit from idling).
+ *
+ * To sum up, when all the required symmetry and throughput-boosting
+ * sub-conditions hold, the second component of the compound condition
+ * evaluates to false, and hence no idling is performed. This helps to
+ * keep the drives' internal queues full on NCQ-capable devices, and hence
+ * to boost the throughput, without causing 'almost' any loss of service
+ * guarantees. The 'almost' follows from the fact that, if the internal
+ * queue of one such device is filled while all the sub-conditions hold,
+ * but at some point in time some sub-condition ceases to hold, then it
+ * may become impossible to let requests be served in the new desired
+ * order until all the requests already queued in the device have been
+ * served.
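+ *
+ * Reduced to a boolean sketch, the expression coded in
+ * bfq_bfqq_must_not_expire() below is:
+ *
+ *   idling for bfqq <=> bfq_bfqq_sync(bfqq) &&
+ *     !cond_for_expiring_in_burst &&
+ *     (bfqq->wr_coeff > 1 || !symmetric_scenario ||
+ *      (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) &&
+ *       !cond_for_expiring_non_wr))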
+ */ +static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; +#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \ + bfqd->busy_in_flight_queues == \ + bfqd->const_seeky_busy_in_flight_queues) + +#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \ + bfqd->hw_tag && \ + (blk_queue_nonrot(bfqd->queue) || \ + bfq_bfqq_constantly_seeky(bfqq))) + +/* + * Condition for expiring a non-weight-raised queue (and hence not idling + * the device). + */ +#define cond_for_expiring_non_wr (bfqd->hw_tag && \ + (bfqd->wr_busy_queues > 0 || \ + (blk_queue_nonrot(bfqd->queue) || \ + cond_for_seeky_on_ncq_hdd))) + + return bfq_bfqq_sync(bfqq) && + !cond_for_expiring_in_burst && + (bfqq->wr_coeff > 1 || !symmetric_scenario || + (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) && + !cond_for_expiring_non_wr) + ); +} + +/* + * If the in-service queue is empty but sync, and the function + * bfq_bfqq_must_not_expire returns true, then: + * 1) the queue must remain in service and cannot be expired, and + * 2) the disk must be idled to wait for the possible arrival of a new + * request for the queue. + * See the comments to the function bfq_bfqq_must_not_expire for the reasons + * why performing device idling is the best choice to boost the throughput + * and preserve service guarantees when bfq_bfqq_must_not_expire itself + * returns true. + */ +static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && + bfq_bfqq_must_not_expire(bfqq); +} + +/* + * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. + */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + struct request *next_rq; + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; + + bfqq = bfqd->in_service_queue; + if (bfqq == NULL) + goto new_queue; + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + + if (bfq_may_expire_for_budg_timeout(bfqq) && + !timer_pending(&bfqd->idle_slice_timer) && + !bfq_bfqq_must_idle(bfqq)) + goto expire; + + next_rq = bfqq->next_rq; + /* + * If bfqq has requests queued and it has enough budget left to + * serve them, keep the queue, otherwise expire it. + */ + if (next_rq != NULL) { + if (bfq_serv_to_charge(next_rq, bfqq) > + bfq_bfqq_budget_left(bfqq)) { + reason = BFQ_BFQQ_BUDGET_EXHAUSTED; + goto expire; + } else { + /* + * The idle timer may be pending because we may + * not disable disk idling even when a new request + * arrives. + */ + if (timer_pending(&bfqd->idle_slice_timer)) { + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, + * 2) then the block layer has unplugged + * the device, causing the dispatch to be + * invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to + * provide a reasonable throughput. + * So we disable idling. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + } + goto keep_queue; + } + } + + /* + * No requests pending. However, if the in-service queue is idling + * for a new request, or has requests waiting for a completion and + * may idle after their completion, then keep it anyway. 
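+ *
+ * Note that in this case the function returns NULL while leaving
+ * in_service_queue set: bfq_dispatch_requests() then dispatches
+ * nothing in this round, which is precisely how idling is
+ * realized.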
+ */ + if (timer_pending(&bfqd->idle_slice_timer) || + (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { + bfqq = NULL; + goto keep_queue; + } + + reason = BFQ_BFQQ_NO_MORE_REQUESTS; +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, reason); +new_queue: + bfqq = bfq_set_in_service_queue(bfqd); + bfq_log(bfqd, "select_queue: new queue %d returned", + bfqq != NULL ? bfqq->pid : 0); +keep_queue: + return bfqq; +} + +static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != + entity->orig_weight * bfqq->wr_coeff); + if (entity->ioprio_changed) + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); + + /* + * If the queue was activated in a burst, or + * too much time has elapsed from the beginning + * of this weight-raising period, or the queue has + * exceeded the acceptable number of cooperations, + * then end weight raising. + */ + if (bfq_bfqq_in_large_burst(bfqq) || + bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + bfqq->last_wr_start_finish = jiffies; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + bfq_bfqq_end_wr(bfqq); + } + } + /* Update weight both if it must be raised and if it must be lowered */ + if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) + __bfq_entity_update_weight_prio( + bfq_entity_service_tree(entity), + entity); +} + +/* + * Dispatch one request from bfqq, moving it to the request queue + * dispatch list. + */ +static int bfq_dispatch_request(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + int dispatched = 0; + struct request *rq; + unsigned long service_to_charge; + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Follow expired path, else get first next available. */ + rq = bfq_check_fifo(bfqq); + if (rq == NULL) + rq = bfqq->next_rq; + service_to_charge = bfq_serv_to_charge(rq, bfqq); + + if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { + /* + * This may happen if the next rq is chosen in fifo order + * instead of sector order. The budget is properly + * dimensioned to be always sufficient to serve the next + * request only if it is chosen in sector order. The reason + * is that it would be quite inefficient and little useful + * to always make sure that the budget is large enough to + * serve even the possible next rq in fifo order. + * In fact, requests are seldom served in fifo order. + * + * Expire the queue for budget exhaustion, and make sure + * that the next act_budget is enough to serve the next + * request, even if it comes from the fifo expired path. + */ + bfqq->next_rq = rq; + /* + * Since this dispatch is failed, make sure that + * a new one will be performed + */ + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + goto expire; + } + + /* Finally, insert request into driver dispatch list. 
*/ + bfq_bfqq_served(bfqq, service_to_charge); + bfq_dispatch_insert(bfqd->queue, rq); + + bfq_update_wr_data(bfqd, bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "dispatched %u sec req (%llu), budg left %lu", + blk_rq_sectors(rq), + (long long unsigned)blk_rq_pos(rq), + bfq_bfqq_budget_left(bfqq)); + + dispatched++; + + if (bfqd->in_service_bic == NULL) { + atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); + bfqd->in_service_bic = RQ_BIC(rq); + } + + if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && + dispatched >= bfqd->bfq_max_budget_async_rq) || + bfq_class_idle(bfqq))) + goto expire; + + return dispatched; + +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); + return dispatched; +} + +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) +{ + int dispatched = 0; + + while (bfqq->next_rq != NULL) { + bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); + dispatched++; + } + + BUG_ON(!list_empty(&bfqq->fifo)); + return dispatched; +} + +/* + * Drain our current requests. + * Used for barriers and when switching io schedulers on-the-fly. + */ +static int bfq_forced_dispatch(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *n; + struct bfq_service_tree *st; + int dispatched = 0; + + bfqq = bfqd->in_service_queue; + if (bfqq != NULL) + __bfq_bfqq_expire(bfqd, bfqq); + + /* + * Loop through classes, and be careful to leave the scheduler + * in a consistent state, as feedback mechanisms and vtime + * updates cannot be disabled during the process. + */ + list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { + st = bfq_entity_service_tree(&bfqq->entity); + + dispatched += __bfq_forced_dispatch_bfqq(bfqq); + bfqq->max_budget = bfq_max_budget(bfqd); + + bfq_forget_idle(st); + } + + BUG_ON(bfqd->busy_queues != 0); + + return dispatched; +} + +static int bfq_dispatch_requests(struct request_queue *q, int force) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + int max_dispatch; + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + if (bfqd->busy_queues == 0) + return 0; + + if (unlikely(force)) + return bfq_forced_dispatch(bfqd); + + bfqq = bfq_select_queue(bfqd); + if (bfqq == NULL) + return 0; + + if (bfq_class_idle(bfqq)) + max_dispatch = 1; + + if (!bfq_bfqq_sync(bfqq)) + max_dispatch = bfqd->bfq_max_budget_async_rq; + + if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { + if (bfqd->busy_queues > 1) + return 0; + if (bfqq->dispatched >= 4 * max_dispatch) + return 0; + } + + if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) + return 0; + + bfq_clear_bfqq_wait_request(bfqq); + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + if (!bfq_dispatch_request(bfqd, bfqq)) + return 0; + + bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", + bfq_bfqq_sync(bfqq) ? "sync" : "async"); + + return 1; +} + +/* + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Queue lock must be held here. 
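+ *
+ * For example (purely illustrative): a queue with one process
+ * reference and two requests in flight has ref == 3, and the
+ * bfq_put_queue() call for the last dropped reference is the one
+ * that actually frees the queue below.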
+ */ +static void bfq_put_queue(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + BUG_ON(atomic_read(&bfqq->ref) <= 0); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, + atomic_read(&bfqq->ref)); + if (!atomic_dec_and_test(&bfqq->ref)) + return; + + BUG_ON(rb_first(&bfqq->sort_list) != NULL); + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree != NULL); + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqd->in_service_queue == bfqq); + + if (bfq_bfqq_sync(bfqq)) + /* + * The fact that this queue is being destroyed does not + * invalidate the fact that this queue may have been + * activated during the current burst. As a consequence, + * although the queue does not exist anymore, and hence + * needs to be removed from the burst list if there, + * the burst size has not to be decremented. + */ + hlist_del_init(&bfqq->burst_list_node); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); +} + +static void bfq_put_cooperator(struct bfq_queue *bfqq) +{ + struct bfq_queue *__bfqq, *next; + + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { + if (__bfqq == bfqq) + break; + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; + } +} + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq == bfqd->in_service_queue) { + __bfq_bfqq_expire(bfqd, bfqq); + bfq_schedule_dispatch(bfqd); + } + + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); +} + +static inline void bfq_init_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + + bic->ttime.last_end_request = jiffies; + /* + * A newly created bic indicates that the process has just + * started doing I/O, and is probably mapping into memory its + * executable and libraries: it definitely needs weight raising. + * There is however the possibility that the process performs, + * for a while, I/O close to some other process. EQM intercepts + * this behavior and may merge the queue corresponding to the + * process with some other queue, BEFORE the weight of the queue + * is raised. Merged queues are not weight-raised (they are assumed + * to belong to processes that benefit only from high throughput). + * If the merge is basically the consequence of an accident, then + * the queue will be split soon and will get back its old weight. + * It is then important to write down somewhere that this queue + * does need weight raising, even if it did not make it to get its + * weight raised before being merged. To this purpose, we overload + * the field raising_time_left and assign 1 to it, to mark the queue + * as needing weight raising. + */ + bic->wr_time_left = 1; +} + +static void bfq_exit_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + + if (bic->bfqq[BLK_RW_ASYNC]) { + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); + bic->bfqq[BLK_RW_ASYNC] = NULL; + } + + if (bic->bfqq[BLK_RW_SYNC]) { + /* + * If the bic is using a shared queue, put the reference + * taken on the io_context when the bic started using a + * shared bfq_queue. 
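+		 * (That reference was taken when the bic switched to the
+		 * shared queue, at merge time; dropping it here keeps the
+		 * io_context refcount balanced on exit.)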
+ */ + if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) + put_io_context(icq->ioc); + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); + bic->bfqq[BLK_RW_SYNC] = NULL; + } +} + +/* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + struct task_struct *tsk = current; + int ioprio_class; + + ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + switch (ioprio_class) { + default: + dev_err(bfqq->bfqd->queue->backing_dev_info.dev, + "bfq: bad prio class %d\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. + */ + bfqq->entity.new_ioprio = task_nice_ioprio(tsk); + bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); + break; + case IOPRIO_CLASS_RT: + bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->entity.new_ioprio = 7; + bfq_clear_bfqq_idle_window(bfqq); + break; + } + + if (bfqq->entity.new_ioprio < 0 || + bfqq->entity.new_ioprio >= IOPRIO_BE_NR) { + printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", + bfqq->entity.new_ioprio); + BUG(); + } + + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio); + bfqq->entity.ioprio_changed = 1; +} + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic) +{ + struct bfq_data *bfqd; + struct bfq_queue *bfqq, *new_bfqq; + struct bfq_group *bfqg; + unsigned long uninitialized_var(flags); + int ioprio = bic->icq.ioc->ioprio; + + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), + &flags); + /* + * This condition may trigger on a newly created bic, be sure to + * drop the lock before returning. + */ + if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) + goto out; + + bic->ioprio = ioprio; + + bfqq = bic->bfqq[BLK_RW_ASYNC]; + if (bfqq != NULL) { + bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, + sched_data); + new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic, + GFP_ATOMIC); + if (new_bfqq != NULL) { + bic->bfqq[BLK_RW_ASYNC] = new_bfqq; + bfq_log_bfqq(bfqd, bfqq, + "check_ioprio_change: bfqq %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } + } + + bfqq = bic->bfqq[BLK_RW_SYNC]; + if (bfqq != NULL) + bfq_set_next_ioprio_data(bfqq, bic); + +out: + bfq_put_bfqd_unlock(bfqd, &flags); +} + +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + INIT_HLIST_NODE(&bfqq->burst_list_node); + + atomic_set(&bfqq->ref, 0); + bfqq->bfqd = bfqd; + + if (bic) + bfq_set_next_ioprio_data(bfqq, bic); + + if (is_sync) { + if (!bfq_class_idle(bfqq)) + bfq_mark_bfqq_idle_window(bfqq); + bfq_mark_bfqq_sync(bfqq); + } + bfq_mark_bfqq_IO_bound(bfqq); + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; + bfqq->pid = pid; + + bfqq->wr_coeff = 1; + bfqq->last_wr_start_finish = 0; + /* + * Set to the value for which bfqq will not be deemed as + * soft rt when it becomes backlogged. 
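+	 * (As its name suggests, bfq_infinity_from_now() yields a
+	 * timestamp far enough in the future that the soft-rt check
+	 * cannot succeed before the field is recomputed.)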
+ */ + bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); +} + +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int is_sync, + struct bfq_io_cq *bic, + gfp_t gfp_mask) +{ + struct bfq_queue *bfqq, *new_bfqq = NULL; + +retry: + /* bic always exists here */ + bfqq = bic_to_bfqq(bic, is_sync); + + /* + * Always try a new alloc if we fall back to the OOM bfqq + * originally, since it should just be a temporary situation. + */ + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = NULL; + if (new_bfqq != NULL) { + bfqq = new_bfqq; + new_bfqq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(bfqd->queue->queue_lock); + new_bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + spin_lock_irq(bfqd->queue->queue_lock); + if (new_bfqq != NULL) + goto retry; + } else { + bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + } + + if (bfqq != NULL) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + } + } + + if (new_bfqq != NULL) + kmem_cache_free(bfq_pool, new_bfqq); + + return bfqq; +} + +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int ioprio_class, int ioprio) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + return &bfqg->async_bfqq[0][ioprio]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_NORM; + /* fall through */ + case IOPRIO_CLASS_BE: + return &bfqg->async_bfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: + return &bfqg->async_idle_bfqq; + default: + BUG(); + } +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct bfq_io_cq *bic, gfp_t gfp_mask) +{ + const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + struct bfq_queue **async_bfqq = NULL; + struct bfq_queue *bfqq = NULL; + + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, + ioprio); + bfqq = *async_bfqq; + } + + if (bfqq == NULL) + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); + + /* + * Pin the queue now that it's allocated, scheduler exit will + * prune it. 
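+	 * Two references are taken for a new async queue: one owned
+	 * by the async_bfqq slot of the group, dropped by
+	 * bfq_put_async_queues() at exit time, and one for the caller.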
+ */ + if (!is_sync && *async_bfqq == NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", + bfqq, atomic_read(&bfqq->ref)); + *async_bfqq = bfqq; + } + + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + return bfqq; +} + +static void bfq_update_io_thinktime(struct bfq_data *bfqd, + struct bfq_io_cq *bic) +{ + unsigned long elapsed = jiffies - bic->ttime.last_end_request; + unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); + + bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; + bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / + bic->ttime.ttime_samples; +} + +static void bfq_update_io_seektime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *rq) +{ + sector_t sdist; + u64 total; + + if (bfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - bfqq->last_request_pos; + else + sdist = bfqq->last_request_pos - blk_rq_pos(rq); + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, pagein, etc. + */ + if (bfqq->seek_samples == 0) /* first request, not really a seek */ + sdist = 0; + else if (bfqq->seek_samples <= 60) /* second & third seek */ + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); + else + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); + + bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; + bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; + total = bfqq->seek_total + (bfqq->seek_samples/2); + do_div(total, bfqq->seek_samples); + bfqq->seek_mean = (sector_t)total; + + bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, + (u64)bfqq->seek_mean); +} + +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter. + */ +static void bfq_update_idle_window(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + int enable_idle; + + /* Don't idle for async or idle io prio class. */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + return; + + /* Idle window just restored, statistics are meaningless. */ + if (bfq_bfqq_just_split(bfqq)) + return; + + enable_idle = bfq_bfqq_idle_window(bfqq); + + if (atomic_read(&bic->icq.ioc->active_ref) == 0 || + bfqd->bfq_slice_idle == 0 || + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && + bfqq->wr_coeff == 1)) + enable_idle = 0; + else if (bfq_sample_valid(bic->ttime.ttime_samples)) { + if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && + bfqq->wr_coeff == 1) + enable_idle = 0; + else + enable_idle = 1; + } + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", + enable_idle); + + if (enable_idle) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); +} + +/* + * Called when a new fs request (rq) is added to bfqq. Check if there's + * something we should do about it. 
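+ *
+ * The decisions below rely on the per-queue statistics updated
+ * above. Both think time and seek distance are decayed 7/8 per
+ * sample; illustrative figures only: a previous seek_total of
+ * 8000 and a new sdist of 1600 give
+ * (7*8000 + 256*1600)/8 = 58200, from which seek_mean is
+ * obtained by dividing by seek_samples (with rounding).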
+ */ +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + struct bfq_io_cq *bic = RQ_BIC(rq); + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + + bfq_update_io_thinktime(bfqd, bic); + bfq_update_io_seektime(bfqd, bfqq, rq); + if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { + bfq_clear_bfqq_constantly_seeky(bfqq); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, bic); + bfq_clear_bfqq_just_split(bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), + (long long unsigned)bfqq->seek_mean); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { + int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + int budget_timeout = bfq_bfqq_budget_timeout(bfqq); + + /* + * There is just this request queued: if the request + * is small and the queue is not to be expired, then + * just exit. + * + * In this way, if the disk is being idled to wait for + * a new request from the in-service queue, we avoid + * unplugging the device and committing the disk to serve + * just a small request. On the contrary, we wait for + * the block layer to decide when to unplug the device: + * hopefully, new requests will be merged to this one + * quickly, then the device will be unplugged and + * larger requests will be dispatched. + */ + if (small_req && !budget_timeout) + return; + + /* + * A large enough request arrived, or the queue is to + * be expired: in both cases disk idling is to be + * stopped, so clear wait_request flag and reset + * timer. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + + /* + * The queue is not empty, because a new request just + * arrived. Hence we can safely expire the queue, in + * case of budget timeout, without risking that the + * timestamps of the queue are not updated correctly. + * See [1] for more details. + */ + if (budget_timeout) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + + /* + * Let the request rip immediately, or let a new queue be + * selected if bfqq has just been expired. + */ + __blk_run_queue(bfqd->queue); + } +} + +static void bfq_insert_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + + assert_spin_locked(bfqd->queue->queue_lock); + + /* + * An unplug may trigger a requeue of a request from the device + * driver: make sure we are in process context while trying to + * merge two bfq_queues. + */ + if (!in_interrupt()) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + if (new_bfqq != NULL) { + if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) + new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); + /* + * Release the request's reference to the old bfqq + * and make sure one is taken to the shared queue. 
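+			 * In short, matching the statements just below:
+			 *
+			 *   new_bfqq->allocated[dir]++;  bfqq->allocated[dir]--;
+			 *   atomic_inc(&new_bfqq->ref);  bfq_put_queue(bfqq);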
+ */ + new_bfqq->allocated[rq_data_dir(rq)]++; + bfqq->allocated[rq_data_dir(rq)]--; + atomic_inc(&new_bfqq->ref); + bfq_put_queue(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + rq->elv.priv[1] = new_bfqq; + bfqq = new_bfqq; + } else + bfq_bfqq_increase_failed_cooperations(bfqq); + } + + bfq_add_request(rq); + + /* + * Here a newly-created bfq_queue has already started a weight-raising + * period: clear raising_time_left to prevent bfq_bfqq_save_state() + * from assigning it a full weight-raising period. See the detailed + * comments about this field in bfq_init_icq(). + */ + if (bfqq->bic != NULL) + bfqq->bic->wr_time_left = 0; + rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); +} + +static void bfq_update_hw_tag(struct bfq_data *bfqd) +{ + bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, + bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; + + /* + * This sample is valid if the number of outstanding requests + * is large enough to allow a queueing behavior. Note that the + * sum is not exact, as it's not taking into account deactivated + * requests. + */ + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) + return; + + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; + bfqd->max_rq_in_driver = 0; + bfqd->hw_tag_samples = 0; +} + +static void bfq_completed_request(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + bool sync = bfq_bfqq_sync(bfqq); + + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", + blk_rq_sectors(rq), sync); + + bfq_update_hw_tag(bfqd); + + BUG_ON(!bfqd->rq_in_driver); + BUG_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->busy_in_flight_queues); + bfqd->busy_in_flight_queues--; + if (bfq_bfqq_constantly_seeky(bfqq)) { + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + } + + if (sync) { + bfqd->sync_flight--; + RQ_BIC(rq)->ttime.last_end_request = jiffies; + } + + /* + * If we are waiting to discover whether the request pattern of the + * task associated with the queue is actually isochronous, and + * both requisites for this condition to hold are satisfied, then + * compute soft_rt_next_start (see the comments to the function + * bfq_bfqq_softrt_next_start()). + */ + if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + + /* + * If this is the in-service queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. 
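+	 * The checks below run in order: arm the idle timer if idling
+	 * is mandatory, otherwise expire on budget timeout, and finally
+	 * expire for lack of requests when nothing is queued or
+	 * dispatched and the queue need not be preserved.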
+ */ + if (bfqd->in_service_queue == bfqq) { + if (bfq_bfqq_budget_new(bfqq)) + bfq_set_budget_timeout(bfqd); + + if (bfq_bfqq_must_idle(bfqq)) { + bfq_arm_slice_timer(bfqd); + goto out; + } else if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + else if (RB_EMPTY_ROOT(&bfqq->sort_list) && + (bfqq->dispatched == 0 || + !bfq_bfqq_must_not_expire(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, 0, + BFQ_BFQQ_NO_MORE_REQUESTS); + } + + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + +out: + return; +} + +static inline int __bfq_may_queue(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { + bfq_clear_bfqq_must_alloc(bfqq); + return ELV_MQUEUE_MUST; + } + + return ELV_MQUEUE_MAY; +} + +static int bfq_may_queue(struct request_queue *q, int rw) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + /* + * Don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be + * queued. So just lookup a possibly existing queue, or return + * 'may queue' if that fails. + */ + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (bic == NULL) + return ELV_MQUEUE_MAY; + + bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); + if (bfqq != NULL) + return __bfq_may_queue(bfqq); + + return ELV_MQUEUE_MAY; +} + +/* + * Queue lock held here. + */ +static void bfq_put_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + if (bfqq != NULL) { + const int rw = rq_data_dir(rq); + + BUG_ON(!bfqq->allocated[rw]); + bfqq->allocated[rw]--; + + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +/* + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this + * was the last process referring to said bfqq. + */ +static struct bfq_queue * +bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + + put_io_context(bic->icq.ioc); + + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; + } + + bic_set_bfqq(bic, NULL, 1); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); + return NULL; +} + +/* + * Allocate bfq data structures associated with this request. 
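+ *
+ * On success the request leaves this hook wired up as follows,
+ * matching the assignments in the function body:
+ *
+ *   rq->elv.priv[0] = bic;   the owning bfq_io_cq
+ *   rq->elv.priv[1] = bfqq;  the queue the request is charged to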
+ */ +static int bfq_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); + const int rw = rq_data_dir(rq); + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + struct bfq_group *bfqg; + unsigned long flags; + bool split = false; + + might_sleep_if(gfp_mask & __GFP_WAIT); + + bfq_check_ioprio_change(bic); + + spin_lock_irqsave(q->queue_lock, flags); + + if (bic == NULL) + goto queue_fail; + + bfqg = bfq_bic_update_cgroup(bic); + +new_queue: + bfqq = bic_to_bfqq(bic, is_sync); + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { + if ((bic->was_in_burst_list && bfqd->large_burst) || + bic->saved_in_large_burst) + bfq_mark_bfqq_in_large_burst(bfqq); + else { + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) + hlist_add_head(&bfqq->burst_list_node, + &bfqd->burst_list); + } + } + } else { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + bfqq = bfq_split_bfqq(bic, bfqq); + split = true; + if (!bfqq) + goto new_queue; + } + } + + bfqq->allocated[rw]++; + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; + + /* + * If a bfq_queue has only one process reference, it is owned + * by only one bfq_io_cq: we can set the bic field of the + * bfq_queue to the address of that structure. Also, if the + * queue has just been split, mark a flag so that the + * information is available to the other scheduler hooks. + */ + if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; + if (split) { + bfq_mark_bfqq_just_split(bfqq); + /* + * If the queue has just been split from a shared + * queue, restore the idle window and the possible + * weight raising period. + */ + bfq_bfqq_resume_state(bfqq, bic); + } + } + + spin_unlock_irqrestore(q->queue_lock, flags); + + return 0; + +queue_fail: + bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(q->queue_lock, flags); + + return 1; +} + +static void bfq_kick_queue(struct work_struct *work) +{ + struct bfq_data *bfqd = + container_of(work, struct bfq_data, unplug_work); + struct request_queue *q = bfqd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +/* + * Handler of the expiration of the timer running if the in-service queue + * is idling inside its time slice. + */ +static void bfq_idle_slice_timer(unsigned long data) +{ + struct bfq_data *bfqd = (struct bfq_data *)data; + struct bfq_queue *bfqq; + unsigned long flags; + enum bfqq_expiration reason; + + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + + bfqq = bfqd->in_service_queue; + /* + * Theoretical race here: the in-service queue can be NULL or + * different from the queue that was idling if the timer handler + * spins on the queue_lock and a new request arrives for the + * current queue and there is a full dispatch cycle that changes + * the in-service queue. This can hardly happen, but in the worst + * case we just expire a queue too early. 
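+	 * (An early expiry is a performance, not a correctness, issue:
+	 * the queue is simply rescheduled and served again later.)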
+ */ + if (bfqq != NULL) { + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQ_BFQQ_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the + * first request of the in-service queue arrives + * during disk idling. + */ + reason = BFQ_BFQQ_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, 1, reason); + } + +schedule_dispatch: + bfq_schedule_dispatch(bfqd); + + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); +} + +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) +{ + del_timer_sync(&bfqd->idle_slice_timer); + cancel_work_sync(&bfqd->unplug_work); +} + +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) +{ + struct bfq_group *root_group = bfqd->root_group; + struct bfq_queue *bfqq = *bfqq_ptr; + + bfq_log(bfqd, "put_async_bfqq: %p", bfqq); + if (bfqq != NULL) { + bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; + } +} + +/* + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure until all the requests on a device are gone). + */ +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); +} + +static void bfq_exit_queue(struct elevator_queue *e) +{ + struct bfq_data *bfqd = e->elevator_data; + struct request_queue *q = bfqd->queue; + struct bfq_queue *bfqq, *n; + + bfq_shutdown_timer_wq(bfqd); + + spin_lock_irq(q->queue_lock); + + BUG_ON(bfqd->in_service_queue != NULL); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, 0); + + bfq_disconnect_groups(bfqd); + spin_unlock_irq(q->queue_lock); + + bfq_shutdown_timer_wq(bfqd); + + synchronize_rcu(); + + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + bfq_free_root_group(bfqd); + kfree(bfqd); +} + +static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) +{ + struct bfq_group *bfqg; + struct bfq_data *bfqd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (eq == NULL) + return -ENOMEM; + + bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); + if (bfqd == NULL) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = bfqd; + + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + atomic_inc(&bfqd->oom_bfqq.ref); + bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.entity.new_weight = + bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio); + /* + * Trigger weight initialization, according to ioprio, at the + * oom_bfqq's first activation. 
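+	 * Setting ioprio_changed below makes that first activation run
+	 * __bfq_entity_update_weight_prio() and pick up new_weight.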
The oom_bfqq's ioprio and ioprio + * class won't be changed any more. + */ + bfqd->oom_bfqq.entity.ioprio_changed = 1; + + bfqd->queue = q; + + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); + + bfqg = bfq_alloc_root_group(bfqd, q->node); + if (bfqg == NULL) { + kfree(bfqd); + kobject_put(&eq->kobj); + return -ENOMEM; + } + + bfqd->root_group = bfqg; + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); +#ifdef CONFIG_CGROUP_BFQIO + bfqd->active_numerous_groups = 0; +#endif + + init_timer(&bfqd->idle_slice_timer); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + bfqd->idle_slice_timer.data = (unsigned long)bfqd; + + bfqd->rq_pos_tree = RB_ROOT; + bfqd->queue_weights_tree = RB_ROOT; + bfqd->group_weights_tree = RB_ROOT; + + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + INIT_HLIST_HEAD(&bfqd->burst_list); + + bfqd->hw_tag = -1; + + bfqd->bfq_max_budget = bfq_default_max_budget; + + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; + bfqd->bfq_back_max = bfq_back_max; + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_class_idle_last_service = 0; + bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + + bfqd->bfq_coop_thresh = 2; + bfqd->bfq_failed_cooperations = 7000; + bfqd->bfq_requests_within_timer = 120; + + bfqd->bfq_large_burst_thresh = 11; + bfqd->bfq_burst_interval = msecs_to_jiffies(500); + + bfqd->low_latency = true; + + bfqd->bfq_wr_coeff = 20; + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); + bfqd->bfq_wr_max_time = 0; + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_wr_max_softrt_rate = 7000; /* + * Approximate rate required + * to playback or record a + * high-definition compressed + * video. + */ + bfqd->wr_busy_queues = 0; + bfqd->busy_in_flight_queues = 0; + bfqd->const_seeky_busy_in_flight_queues = 0; + + /* + * Begin by assuming, optimistically, that the device peak rate is + * equal to the highest reference rate. + */ + bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * + T_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->device_speed = BFQ_BFQD_FAST; + + return 0; +} + +static void bfq_slab_kill(void) +{ + if (bfq_pool != NULL) + kmem_cache_destroy(bfq_pool); +} + +static int __init bfq_slab_setup(void) +{ + bfq_pool = KMEM_CACHE(bfq_queue, 0); + if (bfq_pool == NULL) + return -ENOMEM; + return 0; +} + +static ssize_t bfq_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t bfq_var_store(unsigned long *var, const char *page, + size_t count) +{ + unsigned long new_val; + int ret = kstrtoul(page, 10, &new_val); + + if (ret == 0) + *var = new_val; + + return count; +} + +static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) +{ + struct bfq_data *bfqd = e->elevator_data; + return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : + jiffies_to_msecs(bfq_wr_duration(bfqd))); +} + +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) +{ + struct bfq_queue *bfqq; + struct bfq_data *bfqd = e->elevator_data; + ssize_t num_char = 0; + + num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", + bfqd->queued); + + spin_lock_irq(bfqd->queue->queue_lock); + + num_char += sprintf(page + num_char, "Active:\n"); + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + bfqq->queued[0], + bfqq->queued[1], + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + num_char += sprintf(page + num_char, "Idle:\n"); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - + bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + spin_unlock_irq(bfqd->queue->queue_lock); + + return num_char; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return bfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +SHOW_FUNCTION(bfq_max_budget_async_rq_show, + bfqd->bfq_max_budget_async_rq, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); +SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); +SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); +SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); +SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, + 1); +SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t \ +__FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_max_budget_async_rq_store, 
&bfqd->bfq_max_budget_async_rq, + 1, INT_MAX, 0); +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); +STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, + 1); +STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, + &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, + INT_MAX, 0); +#undef STORE_FUNCTION + +/* do nothing for the moment */ +static ssize_t bfq_weights_store(struct elevator_queue *e, + const char *page, size_t count) +{ + return count; +} + +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +{ + u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) + return bfq_calc_max_budget(bfqd->peak_rate, timeout); + else + return bfq_default_max_budget; +} + +static ssize_t bfq_max_budget_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + else { + if (__data > INT_MAX) + __data = INT_MAX; + bfqd->bfq_max_budget = __data; + } + + bfqd->bfq_user_max_budget = __data; + + return ret; +} + +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data < 1) + __data = 1; + else if (__data > INT_MAX) + __data = INT_MAX; + + bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_low_latency_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (__data == 0 && bfqd->low_latency != 0) + bfq_end_wr(bfqd); + bfqd->low_latency = __data; + + return ret; +} + +#define BFQ_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) + +static struct elv_fs_entry bfq_attrs[] = { + BFQ_ATTR(fifo_expire_sync), + BFQ_ATTR(fifo_expire_async), + BFQ_ATTR(back_seek_max), + BFQ_ATTR(back_seek_penalty), + BFQ_ATTR(slice_idle), + BFQ_ATTR(max_budget), + BFQ_ATTR(max_budget_async_rq), + BFQ_ATTR(timeout_sync), + BFQ_ATTR(timeout_async), + BFQ_ATTR(low_latency), + BFQ_ATTR(wr_coeff), + BFQ_ATTR(wr_max_time), + BFQ_ATTR(wr_rt_max_time), + BFQ_ATTR(wr_min_idle_time), + BFQ_ATTR(wr_min_inter_arr_async), + BFQ_ATTR(wr_max_softrt_rate), + BFQ_ATTR(weights), + __ATTR_NULL +}; + +static struct elevator_type iosched_bfq = { + .ops = { + .elevator_merge_fn = bfq_merge, + .elevator_merged_fn = bfq_merged_request, + .elevator_merge_req_fn = bfq_merged_requests, + .elevator_allow_merge_fn = bfq_allow_merge, + .elevator_dispatch_fn = bfq_dispatch_requests, + .elevator_add_req_fn = bfq_insert_request, + .elevator_activate_req_fn = bfq_activate_request, + .elevator_deactivate_req_fn = 
bfq_deactivate_request, + .elevator_completed_req_fn = bfq_completed_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = elv_rb_latter_request, + .elevator_init_icq_fn = bfq_init_icq, + .elevator_exit_icq_fn = bfq_exit_icq, + .elevator_set_req_fn = bfq_set_request, + .elevator_put_req_fn = bfq_put_request, + .elevator_may_queue_fn = bfq_may_queue, + .elevator_init_fn = bfq_init_queue, + .elevator_exit_fn = bfq_exit_queue, + }, + .icq_size = sizeof(struct bfq_io_cq), + .icq_align = __alignof__(struct bfq_io_cq), + .elevator_attrs = bfq_attrs, + .elevator_name = "bfq", + .elevator_owner = THIS_MODULE, +}; + +static int __init bfq_init(void) +{ + /* + * Can be 0 on HZ < 1000 setups. + */ + if (bfq_slice_idle == 0) + bfq_slice_idle = 1; + + if (bfq_timeout_async == 0) + bfq_timeout_async = 1; + + if (bfq_slab_setup()) + return -ENOMEM; + + /* + * Times to load large popular applications for the typical systems + * installed on the reference devices (see the comments before the + * definitions of the two arrays). + */ + T_slow[0] = msecs_to_jiffies(2600); + T_slow[1] = msecs_to_jiffies(1000); + T_fast[0] = msecs_to_jiffies(5500); + T_fast[1] = msecs_to_jiffies(2000); + + /* + * Thresholds that determine the switch between speed classes (see + * the comments before the definition of the array). + */ + device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; + device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; + + elv_register(&iosched_bfq); + pr_info("BFQ I/O-scheduler: v7r8"); + + return 0; +} + +static void __exit bfq_exit(void) +{ + elv_unregister(&iosched_bfq); + bfq_slab_kill(); +} + +module_init(bfq_init); +module_exit(bfq_exit); + +MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); +MODULE_LICENSE("GPL"); diff -Nur linux-4.2/block/bfq-sched.c linux-4.2-pck/block/bfq-sched.c --- linux-4.2/block/bfq-sched.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/block/bfq-sched.c 2015-09-08 00:48:22.218364355 -0300 @@ -0,0 +1,1180 @@ +/* + * BFQ: Hierarchical B-WF2Q+ scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + */ + +#ifdef CONFIG_CGROUP_BFQIO +#define for_each_entity(entity) \ + for (; entity != NULL; entity = entity->parent) + +#define for_each_entity_safe(entity, parent) \ + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd); + +static inline void bfq_update_budget(struct bfq_entity *next_in_service) +{ + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + + BUG_ON(next_in_service == NULL); + + group_sd = next_in_service->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity + * as it must never become an in-service entity. 
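+	 * (For the root group, my_entity is NULL; the check below
+	 * relies on exactly that.)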
+ */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity != NULL) + bfqg_entity->budget = next_in_service->budget; +} + +static int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ + struct bfq_entity *next_in_service; + + if (sd->in_service_entity != NULL) + /* will update/requeue at the end of service */ + return 0; + + /* + * NOTE: this can be improved in many ways, such as returning + * 1 (and thus propagating upwards the update) only when the + * budget changes, or caching the bfqq that will be scheduled + * next from this subtree. By now we worry more about + * correctness than about performance... + */ + next_in_service = bfq_lookup_next_entity(sd, 0, NULL); + sd->next_in_service = next_in_service; + + if (next_in_service != NULL) + bfq_update_budget(next_in_service); + + return 1; +} + +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ + BUG_ON(sd->next_in_service != entity); +} +#else +#define for_each_entity(entity) \ + for (; entity != NULL; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity != NULL; entity = parent) + +static inline int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ + return 0; +} + +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ +} + +static inline void bfq_update_budget(struct bfq_entity *next_in_service) +{ +} +#endif + +/* + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system + * (big shift values increase it), and the period of virtual time + * wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static inline int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = NULL; + + BUG_ON(entity == NULL); + + if (entity->my_sched_data == NULL) + bfqq = container_of(entity, struct bfq_queue, entity); + + return bfqq; +} + + +/** + * bfq_delta - map service into the virtual time domain. + * @service: amount of service. + * @weight: scale factor (weight of an entity or weight sum). + */ +static inline u64 bfq_delta(unsigned long service, + unsigned long weight) +{ + u64 d = (u64)service << WFQ_SERVICE_SHIFT; + + do_div(d, weight); + return d; +} + +/** + * bfq_calc_finish - assign the finish time to an entity. + * @entity: the entity to act upon. + * @service: the service to be charged to the entity. + */ +static inline void bfq_calc_finish(struct bfq_entity *entity, + unsigned long service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(entity->weight == 0); + + entity->finish = entity->start + + bfq_delta(service, entity->weight); + + if (bfqq != NULL) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: start %llu, finish %llu, delta %llu", + entity->start, entity->finish, + bfq_delta(service, entity->weight)); + } +} + +/** + * bfq_entity_of - get an entity from a node. + * @node: the node field of the entity. + * + * Convert a node pointer to the relative entity. 
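+ * (A NULL @node simply yields a NULL entity.)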
This is used only + * to simplify the logic of some functions and not as the generic + * conversion mechanism because, e.g., in the tree walking functions, + * the check for a %NULL value would be redundant. + */ +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) +{ + struct bfq_entity *entity = NULL; + + if (node != NULL) + entity = rb_entry(node, struct bfq_entity, rb_node); + + return entity; +} + +/** + * bfq_extract - remove an entity from a tree. + * @root: the tree root. + * @entity: the entity to remove. + */ +static inline void bfq_extract(struct rb_root *root, + struct bfq_entity *entity) +{ + BUG_ON(entity->tree != root); + + entity->tree = NULL; + rb_erase(&entity->rb_node, root); +} + +/** + * bfq_idle_extract - extract an entity from the idle tree. + * @st: the service tree of the owning @entity. + * @entity: the entity being removed. + */ +static void bfq_idle_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *next; + + BUG_ON(entity->tree != &st->idle); + + if (entity == st->first_idle) { + next = rb_next(&entity->rb_node); + st->first_idle = bfq_entity_of(next); + } + + if (entity == st->last_idle) { + next = rb_prev(&entity->rb_node); + st->last_idle = bfq_entity_of(next); + } + + bfq_extract(&st->idle, entity); + + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_insert - generic tree insertion. + * @root: tree root. + * @entity: entity to insert. + * + * This is used for the idle and the active tree, since they are both + * ordered by finish time. + */ +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) +{ + struct bfq_entity *entry; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + + BUG_ON(entity->tree != NULL); + + while (*node != NULL) { + parent = *node; + entry = rb_entry(parent, struct bfq_entity, rb_node); + + if (bfq_gt(entry->finish, entity->finish)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, root); + + entity->tree = root; +} + +/** + * bfq_update_min - update the min_start field of a entity. + * @entity: the entity to update. + * @node: one of its children. + * + * This function is called when @entity may store an invalid value for + * min_start due to updates to the active tree. The function assumes + * that the subtree rooted at @node (which may be its left or its right + * child) has a valid min_start value. + */ +static inline void bfq_update_min(struct bfq_entity *entity, + struct rb_node *node) +{ + struct bfq_entity *child; + + if (node != NULL) { + child = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entity->min_start, child->min_start)) + entity->min_start = child->min_start; + } +} + +/** + * bfq_update_active_node - recalculate min_start. + * @node: the node to update. + * + * @node may have changed position or one of its children may have moved, + * this function updates its min_start value. The left and right subtrees + * are assumed to hold a correct min_start value. + */ +static inline void bfq_update_active_node(struct rb_node *node) +{ + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + + entity->min_start = entity->start; + bfq_update_min(entity, node->rb_right); + bfq_update_min(entity, node->rb_left); +} + +/** + * bfq_update_active_tree - update min_start for the whole active tree. + * @node: the starting node. 
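+ *
+ * Walking up from @node, the loop refreshes each visited node
+ * and the sibling subtree root on the path, since those are the
+ * only nodes whose min_start may be stale.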
+ * + * @node must be the deepest modified node after an update. This function + * updates its min_start using the values held by its children, assuming + * that they did not change, and then updates all the nodes that may have + * changed in the path to the root. The only nodes that may have changed + * are the ones in the path or their siblings. + */ +static void bfq_update_active_tree(struct rb_node *node) +{ + struct rb_node *parent; + +up: + bfq_update_active_node(node); + + parent = rb_parent(node); + if (parent == NULL) + return; + + if (node == parent->rb_left && parent->rb_right != NULL) + bfq_update_active_node(parent->rb_right); + else if (parent->rb_left != NULL) + bfq_update_active_node(parent->rb_left); + + node = parent; + goto up; +} + +static void bfq_weights_tree_add(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); + +static void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); + + +/** + * bfq_active_insert - insert an entity in the active tree of its + * group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. + * + * The active tree is ordered by finish time, but an extra key is kept + * per each node, containing the minimum value for the start times of + * its children (and the node itself), so it's possible to search for + * the eligible node with the lowest finish time in logarithmic time. + */ +static void bfq_active_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + bfq_insert(&st->active, entity); + + if (node->rb_left != NULL) + node = node->rb_left; + else if (node->rb_right != NULL) + node = node->rb_right; + + bfq_update_active_tree(node); + +#ifdef CONFIG_CGROUP_BFQIO + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); +#ifdef CONFIG_CGROUP_BFQIO + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); + } + if (bfqg != bfqd->root_group) { + BUG_ON(!bfqg); + BUG_ON(!bfqd); + bfqg->active_entities++; + if (bfqg->active_entities == 2) + bfqd->active_numerous_groups++; + } +#endif +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ +static inline unsigned short bfq_ioprio_to_weight(int ioprio) +{ + BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + return IOPRIO_BE_NR - ioprio; +} + +/** + * bfq_weight_to_ioprio - calc an ioprio from a weight. + * @weight: the weight value to convert. + * + * To preserve as mush as possible the old only-ioprio user interface, + * 0 is used as an escape ioprio value for weights (numerically) equal or + * larger than IOPRIO_BE_NR + */ +static inline unsigned short bfq_weight_to_ioprio(int weight) +{ + BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); + return IOPRIO_BE_NR - weight < 0 ? 
0 : IOPRIO_BE_NR - weight; +} + +static inline void bfq_get_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq != NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + } +} + +/** + * bfq_find_deepest - find the deepest node that an extraction can modify. + * @node: the node being removed. + * + * Do the first step of an extraction in an rb tree, looking for the + * node that will replace @node, and returning the deepest node that + * the following modifications to the tree can touch. If @node is the + * last node in the tree return %NULL. + */ +static struct rb_node *bfq_find_deepest(struct rb_node *node) +{ + struct rb_node *deepest; + + if (node->rb_right == NULL && node->rb_left == NULL) + deepest = rb_parent(node); + else if (node->rb_right == NULL) + deepest = node->rb_left; + else if (node->rb_left == NULL) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right != NULL) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/** + * bfq_active_extract - remove an entity from the active tree. + * @st: the service_tree containing the tree. + * @entity: the entity being removed. + */ +static void bfq_active_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + node = bfq_find_deepest(&entity->rb_node); + bfq_extract(&st->active, entity); + + if (node != NULL) + bfq_update_active_tree(node); + +#ifdef CONFIG_CGROUP_BFQIO + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +#ifdef CONFIG_CGROUP_BFQIO + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_remove(bfqd, entity, + &bfqd->group_weights_tree); + } + if (bfqg != bfqd->root_group) { + BUG_ON(!bfqg); + BUG_ON(!bfqd); + BUG_ON(!bfqg->active_entities); + bfqg->active_entities--; + if (bfqg->active_entities == 1) { + BUG_ON(!bfqd->active_numerous_groups); + bfqd->active_numerous_groups--; + } + } +#endif +} + +/** + * bfq_idle_insert - insert an entity into the idle tree. + * @st: the service tree containing the tree. + * @entity: the entity to insert. + */ +static void bfq_idle_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) + st->first_idle = entity; + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) + st->last_idle = entity; + + bfq_insert(&st->idle, entity); + + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); +} + +/** + * bfq_forget_entity - remove an entity from the wfq trees. + * @st: the service tree. + * @entity: the entity being removed. + * + * Update the device status and forget everything about @entity, putting + * the device reference to it, if it is a queue. Entities belonging to + * groups are not refcounted. 
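+ * (bfq_get_entity() above takes that reference only when the
+ * entity maps to a bfq_queue; for group entities it is a no-op.)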
+ */ +static void bfq_forget_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_sched_data *sd; + + BUG_ON(!entity->on_st); + + entity->on_st = 0; + st->wsum -= entity->weight; + if (bfqq != NULL) { + sd = entity->sched_data; + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +/** + * bfq_put_idle_entity - release the idle tree ref of an entity. + * @st: service tree for the entity. + * @entity: the entity being released. + */ +static void bfq_put_idle_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + bfq_idle_extract(st, entity); + bfq_forget_entity(st, entity); +} + +/** + * bfq_forget_idle - update the idle tree if necessary. + * @st: the service tree to act upon. + * + * To preserve the global O(log N) complexity we only remove one entry here; + * as the idle tree will not grow indefinitely this can be done safely. + */ +static void bfq_forget_idle(struct bfq_service_tree *st) +{ + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && + !bfq_gt(last_idle->finish, st->vtime)) { + /* + * Forget the whole idle tree, increasing the vtime past + * the last finish time of idle entities. + */ + st->vtime = last_idle->finish; + } + + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) + bfq_put_idle_entity(st, first_idle); +} + +static struct bfq_service_tree * +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + struct bfq_entity *entity) +{ + struct bfq_service_tree *new_st = old_st; + + if (entity->ioprio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned short prev_weight, new_weight; + struct bfq_data *bfqd = NULL; + struct rb_root *root; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd; + struct bfq_group *bfqg; +#endif + + if (bfqq != NULL) + bfqd = bfqq->bfqd; +#ifdef CONFIG_CGROUP_BFQIO + else { + sd = entity->my_sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; + BUG_ON(!bfqd); + } +#endif + + BUG_ON(old_st->wsum < entity->weight); + old_st->wsum -= entity->weight; + + if (entity->new_weight != entity->orig_weight) { + if (entity->new_weight < BFQ_MIN_WEIGHT || + entity->new_weight > BFQ_MAX_WEIGHT) { + printk(KERN_CRIT "update_weight_prio: " + "new_weight %d\n", + entity->new_weight); + BUG(); + } + entity->orig_weight = entity->new_weight; + entity->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); + } + + entity->ioprio_class = entity->new_ioprio_class; + entity->ioprio_changed = 0; + + /* + * NOTE: here we may be changing the weight too early, + * this will cause unfairness. The correct approach + * would have required additional complexity to defer + * weight changes to the proper time instants (i.e., + * when entity->finish <= old_st->vtime). + */ + new_st = bfq_entity_service_tree(entity); + + prev_weight = entity->weight; + new_weight = entity->orig_weight * + (bfqq != NULL ? bfqq->wr_coeff : 1); + /* + * If the weight of the entity changes, remove the entity + * from its old weight counter (if there is a counter + * associated with the entity), and add it to the counter + * associated with its new weight. + */ + if (prev_weight != new_weight) { + root = bfqq ? 
&bfqd->queue_weights_tree : + &bfqd->group_weights_tree; + bfq_weights_tree_remove(bfqd, entity, root); + } + entity->weight = new_weight; + /* + * Add the entity to its weights tree only if it is + * not associated with a weight-raised queue. + */ + if (prev_weight != new_weight && + (bfqq ? bfqq->wr_coeff == 1 : 1)) + /* If we get here, root has been initialized. */ + bfq_weights_tree_add(bfqd, entity, root); + + new_st->wsum += entity->weight; + + if (new_st != old_st) + entity->start = new_st->vtime; + } + + return new_st; +} + +/** + * bfq_bfqq_served - update the scheduler status after selection for + * service. + * @bfqq: the queue being served. + * @served: bytes to transfer. + * + * NOTE: this can be optimized, as the timestamps of upper level entities + * are synchronized every time a new bfqq is selected for service. By now, + * we keep it to better check consistency. + */ +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st; + + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); + + entity->service += served; + BUG_ON(entity->service > entity->budget); + BUG_ON(st->wsum == 0); + + st->vtime += bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); +} + +/** + * bfq_bfqq_charge_full_budget - set the service to the entity budget. + * @bfqq: the queue that needs a service update. + * + * When it's not possible to be fair in the service domain, because + * a queue is not consuming its budget fast enough (the meaning of + * fast depends on the timeout parameter), we charge it a full + * budget. In this way we should obtain a sort of time-domain + * fairness among all the seeky/slow queues. + */ +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); + + bfq_bfqq_served(bfqq, entity->budget - entity->service); +} + +/** + * __bfq_activate_entity - activate an entity. + * @entity: the entity being activated. + * + * Called whenever an entity is activated, i.e., it is not active and one + * of its children receives a new request, or has to be reactivated due to + * budget exhaustion. It uses the current budget of the entity (and the + * service received if @entity is active) of the queue to calculate its + * timestamps. + */ +static void __bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (entity == sd->in_service_entity) { + BUG_ON(entity->tree != NULL); + /* + * If we are requeueing the current entity we have + * to take care of not charging to it service it has + * not received. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) { + /* + * Requeueing an entity due to a change of some + * next_in_service entity below it. We reuse the + * old start time. + */ + bfq_active_extract(st, entity); + } else if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(st->vtime, entity->finish) ? 
+ st->vtime : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = st->vtime; + st->wsum += entity->weight; + bfq_get_entity(entity); + + BUG_ON(entity->on_st); + entity->on_st = 1; + } + + st = __bfq_entity_update_weight_prio(st, entity); + bfq_calc_finish(entity, entity->budget); + bfq_active_insert(st, entity); +} + +/** + * bfq_activate_entity - activate an entity and its ancestors if necessary. + * @entity: the entity to activate. + * + * Activate @entity and all the entities on the path from it to the root. + */ +static void bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd; + + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + /* + * No need to propagate the activation to the + * upper entities, as they will be updated when + * the in-service entity is rescheduled. + */ + break; + } +} + +/** + * __bfq_deactivate_entity - deactivate an entity from its service tree. + * @entity: the entity to deactivate. + * @requeue: if false, the entity will not be put into the idle tree. + * + * Deactivate an entity, independently from its previous state. If the + * entity was not on a service tree just return, otherwise if it is on + * any scheduler tree, extract it from that tree, and if necessary + * and if the caller did not specify @requeue, put it on the idle tree. + * + * Return %1 if the caller should update the entity hierarchy, i.e., + * if the entity was in service or if it was the next_in_service for + * its sched_data; return %0 otherwise. + */ +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + int was_in_service = entity == sd->in_service_entity; + int ret = 0; + + if (!entity->on_st) + return 0; + + BUG_ON(was_in_service && entity->tree != NULL); + + if (was_in_service) { + bfq_calc_finish(entity, entity->service); + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) + bfq_active_extract(st, entity); + else if (entity->tree == &st->idle) + bfq_idle_extract(st, entity); + else if (entity->tree != NULL) + BUG(); + + if (was_in_service || sd->next_in_service == entity) + ret = bfq_update_next_in_service(sd); + + if (!requeue || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity); + else + bfq_idle_insert(st, entity); + + BUG_ON(sd->in_service_entity == entity); + BUG_ON(sd->next_in_service == entity); + + return ret; +} + +/** + * bfq_deactivate_entity - deactivate an entity. + * @entity: the entity to deactivate. + * @requeue: true if the entity can be put on the idle tree + */ +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd; + struct bfq_entity *parent; + + for_each_entity_safe(entity, parent) { + sd = entity->sched_data; + + if (!__bfq_deactivate_entity(entity, requeue)) + /* + * The parent entity is still backlogged, and + * we don't need to update it as it is still + * in service. + */ + break; + + if (sd->next_in_service != NULL) + /* + * The parent entity is still backlogged and + * the budgets on the path towards the root + * need to be updated. + */ + goto update; + + /* + * If we reach there the parent is no more backlogged and + * we want to propagate the dequeue upwards. 
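/*
 * The timestamp comparisons above (start vs. st->vtime, finish vs.
 * st->vtime) go through bfq_gt(), which is defined in a part of the
 * patch not shown in this excerpt. For u64 virtual clocks the usual,
 * and presumably intended, form is the wraparound-safe test sketched
 * below; a plain '>' would misorder timestamps once the clock wraps.
 */
#include <stdio.h>
#include <stdint.h>

/* "a is later than b", modulo 2^64 */
static int ts_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;	/* vtime just before wrapping */
	uint64_t wrapped = 10;			/* finish time just after */

	printf("plain > : %d\n", wrapped > near_wrap);		/* 0, wrong */
	printf("ts_gt   : %d\n", ts_gt(wrapped, near_wrap));	/* 1, correct */
	return 0;
}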
+ */ + requeue = 1; + } + + return; + +update: + entity = parent; + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + break; + } +} + +/** + * bfq_update_vtime - update vtime if necessary. + * @st: the service tree to act upon. + * + * If necessary update the service tree vtime to have at least one + * eligible entity, skipping to its start time. Assumes that the + * active tree of the device is not empty. + * + * NOTE: this hierarchical implementation updates vtimes quite often, + * we may end up with reactivated processes getting timestamps after a + * vtime skip done because we needed a ->first_active entity on some + * intermediate node. + */ +static void bfq_update_vtime(struct bfq_service_tree *st) +{ + struct bfq_entity *entry; + struct rb_node *node = st->active.rb_node; + + entry = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entry->min_start, st->vtime)) { + st->vtime = entry->min_start; + bfq_forget_idle(st); + } +} + +/** + * bfq_first_active_entity - find the eligible entity with + * the smallest finish time + * @st: the service tree to select from. + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is + * a subtree with at least one eligible (start >= vtime) entity. The path on + * the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. + */ +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) +{ + struct bfq_entity *entry, *first = NULL; + struct rb_node *node = st->active.rb_node; + + while (node != NULL) { + entry = rb_entry(node, struct bfq_entity, rb_node); +left: + if (!bfq_gt(entry->start, st->vtime)) + first = entry; + + BUG_ON(bfq_gt(entry->min_start, st->vtime)); + + if (node->rb_left != NULL) { + entry = rb_entry(node->rb_left, + struct bfq_entity, rb_node); + if (!bfq_gt(entry->min_start, st->vtime)) { + node = node->rb_left; + goto left; + } + } + if (first != NULL) + break; + node = node->rb_right; + } + + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * Update the virtual time in @st and return the first eligible entity + * it contains. + */ +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, + bool force) +{ + struct bfq_entity *entity, *new_next_in_service = NULL; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + bfq_update_vtime(st); + entity = bfq_first_active_entity(st); + BUG_ON(bfq_gt(entity->start, st->vtime)); + + /* + * If the chosen entity does not match with the sched_data's + * next_in_service and we are forcedly serving the IDLE priority + * class tree, bubble up budget update. + */ + if (unlikely(force && entity != entity->sched_data->next_in_service)) { + new_next_in_service = entity; + for_each_entity(new_next_in_service) + bfq_update_budget(new_next_in_service); + } + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * @extract: if true the returned entity will be also extracted from @sd. 
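/*
 * Semantic model of what bfq_update_vtime() and
 * bfq_first_active_entity() above compute together: push vtime up to
 * the smallest start time if nothing is eligible yet, then serve the
 * eligible entity (start <= vtime) with the smallest finish time. The
 * kernel does this in O(log N) via the min_start-augmented rbtree;
 * the linear scan below only makes the selection rule concrete, and
 * the numbers are invented.
 */
#include <stdio.h>
#include <stdint.h>

struct ent { const char *name; uint64_t start, finish; };

int main(void)
{
	struct ent e[] = {
		{ "A", 100, 180 },
		{ "B", 140, 150 },	/* smallest finish, but not yet eligible */
		{ "C", 105, 160 },
	};
	uint64_t vtime = 110, min_start = UINT64_MAX;
	struct ent *best = NULL;
	int i;

	/* bfq_update_vtime(): ensure at least one eligible entity */
	for (i = 0; i < 3; i++)
		if (e[i].start < min_start)
			min_start = e[i].start;
	if (min_start > vtime)
		vtime = min_start;

	/* pick the eligible entity with the smallest finish time */
	for (i = 0; i < 3; i++) {
		if (e[i].start > vtime)
			continue;
		if (!best || e[i].finish < best->finish)
			best = &e[i];
	}
	printf("vtime=%llu, next in service: %s\n",
	       (unsigned long long)vtime, best ? best->name : "none");
	return 0;
}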
+ * + * NOTE: since we cache the next_in_service entity at each level of the + * hierarchy, the complexity of the lookup can be decreased with + * absolutely no effort just returning the cached next_in_service value; + * we prefer to do full lookups to test the consistency of * the data + * structures. + */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_entity *entity; + int i = 0; + + BUG_ON(sd->in_service_entity != NULL); + + if (bfqd != NULL && + jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, + true); + if (entity != NULL) { + i = BFQ_IOPRIO_CLASSES - 1; + bfqd->bfq_class_idle_last_service = jiffies; + sd->next_in_service = entity; + } + } + for (; i < BFQ_IOPRIO_CLASSES; i++) { + entity = __bfq_lookup_next_entity(st + i, false); + if (entity != NULL) { + if (extract) { + bfq_check_next_in_service(sd, entity); + bfq_active_extract(st + i, entity); + sd->in_service_entity = entity; + sd->next_in_service = NULL; + } + break; + } + } + + return entity; +} + +/* + * Get next queue for service. + */ +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + BUG_ON(bfqd->in_service_queue != NULL); + + if (bfqd->busy_queues == 0) + return NULL; + + sd = &bfqd->root_group->sched_data; + for (; sd != NULL; sd = entity->my_sched_data) { + entity = bfq_lookup_next_entity(sd, 1, bfqd); + BUG_ON(entity == NULL); + entity->service = 0; + } + + bfqq = bfq_entity_to_bfqq(entity); + BUG_ON(bfqq == NULL); + + return bfqq; +} + +static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +{ + if (bfqd->in_service_bic != NULL) { + put_io_context(bfqd->in_service_bic->icq.ioc); + bfqd->in_service_bic = NULL; + } + + bfqd->in_service_queue = NULL; + del_timer(&bfqd->idle_slice_timer); +} + +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfqq == bfqd->in_service_queue) + __bfq_bfqd_reset_in_service(bfqd); + + bfq_deactivate_entity(entity, requeue); +} + +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_entity(entity); +} + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. + */ +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + BUG_ON(bfqd->busy_queues == 0); + bfqd->busy_queues--; + + if (!bfqq->dispatched) { + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->busy_in_flight_queues); + bfqd->busy_in_flight_queues--; + if (bfq_bfqq_constantly_seeky(bfqq)) { + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + } + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues--; + + bfq_deactivate_bfqq(bfqd, bfqq, requeue); +} + +/* + * Called when an inactive queue receives a new request. 
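/*
 * Control-flow sketch of the class scan in bfq_lookup_next_entity()
 * above: the RT, BE and IDLE service trees are scanned in priority
 * order, except that the IDLE class is force-served once per
 * BFQ_CL_IDLE_TIMEOUT so it is never starved outright. The tree
 * lookup itself is stubbed out; the timeout and timestamps are
 * invented and only the shape of the decision matches the kernel code.
 */
#include <stdio.h>

#define NCLASSES 3			/* RT, BE, IDLE */
#define IDLE (NCLASSES - 1)

static const char *pick(const int has_work[NCLASSES], long now,
			long *idle_last, long timeout)
{
	static const char *names[NCLASSES] = { "RT", "BE", "IDLE" };
	int i;

	/* anti-starvation: serve the idle class if it waited long enough */
	if (now - *idle_last > timeout && has_work[IDLE]) {
		*idle_last = now;
		return names[IDLE];
	}
	for (i = 0; i < NCLASSES; i++)
		if (has_work[i])
			return names[i];
	return "none";
}

int main(void)
{
	int has_work[NCLASSES] = { 0, 1, 1 };	/* BE and IDLE backlogged */
	long idle_last = 0, now;

	for (now = 10; now <= 50; now += 10)
		printf("t=%2ld -> %s\n", now, pick(has_work, now, &idle_last, 20));
	return 0;
}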
+ */
+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ BUG_ON(bfq_bfqq_busy(bfqq));
+ BUG_ON(bfqq == bfqd->in_service_queue);
+
+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
+
+ bfq_activate_bfqq(bfqd, bfqq);
+
+ bfq_mark_bfqq_busy(bfqq);
+ bfqd->busy_queues++;
+
+ if (!bfqq->dispatched) {
+ if (bfqq->wr_coeff == 1)
+ bfq_weights_tree_add(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+ if (!blk_queue_nonrot(bfqd->queue)) {
+ bfqd->busy_in_flight_queues++;
+ if (bfq_bfqq_constantly_seeky(bfqq))
+ bfqd->const_seeky_busy_in_flight_queues++;
+ }
+ }
+ if (bfqq->wr_coeff > 1)
+ bfqd->wr_busy_queues++;
+}
diff -Nur linux-4.2/block/bfq.h linux-4.2-pck/block/bfq.h
--- linux-4.2/block/bfq.h 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/block/bfq.h 2015-09-08 00:48:22.218364355 -0300
@@ -0,0 +1,807 @@
+/*
+ * BFQ-v7r8 for 4.1.0: data structures and common function prototypes.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ */
+
+#ifndef _BFQ_H
+#define _BFQ_H
+
+#include <linux/blktrace_api.h>
+#include <linux/hrtimer.h>
+#include <linux/ioprio.h>
+#include <linux/rbtree.h>
+
+#define BFQ_IOPRIO_CLASSES 3
+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
+
+#define BFQ_MIN_WEIGHT 1
+#define BFQ_MAX_WEIGHT 1000
+
+#define BFQ_DEFAULT_QUEUE_IOPRIO 4
+
+#define BFQ_DEFAULT_GRP_WEIGHT 10
+#define BFQ_DEFAULT_GRP_IOPRIO 0
+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
+
+struct bfq_entity;
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ * @active: tree for active entities (i.e., those backlogged).
+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
+ * @first_idle: idle entity with minimum F_i.
+ * @last_idle: idle entity with maximum F_i.
+ * @vtime: scheduler virtual time.
+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree. All the fields are protected by the queue lock
+ * of the containing bfqd.
+ */
+struct bfq_service_tree {
+ struct rb_root active;
+ struct rb_root idle;
+
+ struct bfq_entity *first_idle;
+ struct bfq_entity *last_idle;
+
+ u64 vtime;
+ unsigned long wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ * @in_service_entity: entity in service.
+ * @next_in_service: head-of-the-line entity in the scheduler.
+ * @service_tree: array of service trees, one per ioprio_class.
+ *
+ * bfq_sched_data is the basic scheduler queue. It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as
+ * an intermediate queue in a hierarchical setup.
+ * @next_in_service points to the active entity of the sched_data
+ * service trees that will be scheduled next.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order: IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; among requests of the same
+ * class, requests are served according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_sched_data {
+ struct bfq_entity *in_service_entity;
+ struct bfq_entity *next_in_service;
+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+};
+
+/**
+ * struct bfq_weight_counter - counter of the number of all active entities
+ * with a given weight.
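/*
 * The accounting such a counter supports, reduced to an array-backed
 * sketch: one node per distinct weight, so "do all active entities
 * share one weight?" degenerates to "is there exactly one node?".
 * The kernel keeps these nodes in rbtrees and manipulates them in
 * bfq_weights_tree_[add|remove]; the fixed array is only the smallest
 * stand-in that shows the add/remove bookkeeping.
 */
#include <stdio.h>

struct wcounter { int weight, num_active; };

static struct wcounter counters[16];
static int nr_counters;

static void weights_add(int weight)
{
	int i;

	for (i = 0; i < nr_counters; i++)
		if (counters[i].weight == weight) {
			counters[i].num_active++;
			return;
		}
	counters[nr_counters].weight = weight;
	counters[nr_counters].num_active = 1;
	nr_counters++;
}

static void weights_remove(int weight)
{
	int i;

	for (i = 0; i < nr_counters; i++)
		if (counters[i].weight == weight) {
			if (--counters[i].num_active == 0)
				counters[i] = counters[--nr_counters];
			return;
		}
}

int main(void)
{
	weights_add(10);
	weights_add(10);
	weights_add(20);
	printf("all same weight? %s\n", nr_counters == 1 ? "yes" : "no");
	weights_remove(20);
	printf("all same weight? %s\n", nr_counters == 1 ? "yes" : "no");
	return 0;
}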
+ * @weight: weight of the entities that this counter refers to. + * @num_active: number of active entities with this weight. + * @weights_node: weights tree member (see bfq_data's @queue_weights_tree + * and @group_weights_tree). + */ +struct bfq_weight_counter { + short int weight; + unsigned int num_active; + struct rb_node weights_node; +}; + +/** + * struct bfq_entity - schedulable entity. + * @rb_node: service_tree member. + * @weight_counter: pointer to the weight counter associated with this entity. + * @on_st: flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). + * @finish: B-WF2Q+ finish timestamp (aka F_i). + * @start: B-WF2Q+ start timestamp (aka S_i). + * @tree: tree the entity is enqueued into; %NULL if not on a tree. + * @min_start: minimum start time of the (active) subtree rooted at + * this entity; used for O(log N) lookups into active trees. + * @service: service received during the last round of service. + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. + * @weight: weight of the queue + * @parent: parent entity, for hierarchical scheduling. + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the + * associated scheduler queue, %NULL on leaf nodes. + * @sched_data: the scheduler queue this entity belongs to. + * @ioprio: the ioprio in use. + * @new_weight: when a weight change is requested, the new weight value. + * @orig_weight: original weight, used to implement weight boosting + * @new_ioprio: when an ioprio change is requested, the new ioprio value. + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. + * @ioprio_changed: flag, true when the user requested a weight, ioprio or + * ioprio_class change. + * + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. + * + * Each entity stores independently its priority values; this would + * allow different weights on different devices, but this + * functionality is not exported to userspace by now. Priorities and + * weights are updated lazily, first storing the new values into the + * new_* fields, then setting the @ioprio_changed flag. As soon as + * there is a transition in the entity state that allows the priority + * update to take place the effective and the requested priority + * values are synchronized. + * + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * ``well-behaved'' queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. 
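/*
 * The documentation above says that, without cgroups, the weight is
 * derived from the ioprio to keep CFQ's interface. Together with the
 * bfq_weight_to_ioprio() tail visible at the very top of this excerpt
 * ("0 : IOPRIO_BE_NR - weight"), the mapping is linear and clamped.
 * A stand-alone rendering, with IOPRIO_BE_NR fixed at its kernel
 * value of 8:
 */
#include <stdio.h>

#define IOPRIO_BE_NR 8

/* ioprio 0 (highest) .. 7 (lowest)  ->  weight 8 .. 1 */
static int ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

/* inverse mapping; large cgroup-style weights clamp to ioprio 0 */
static int weight_to_ioprio(int weight)
{
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	int p;

	for (p = 0; p < IOPRIO_BE_NR; p++)
		printf("ioprio %d -> weight %d\n", p, ioprio_to_weight(p));
	printf("weight 1000 -> ioprio %d\n", weight_to_ioprio(1000));
	return 0;
}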
+ */ +struct bfq_entity { + struct rb_node rb_node; + struct bfq_weight_counter *weight_counter; + + int on_st; + + u64 finish; + u64 start; + + struct rb_root *tree; + + u64 min_start; + + unsigned long service, budget; + unsigned short weight, new_weight; + unsigned short orig_weight; + + struct bfq_entity *parent; + + struct bfq_sched_data *my_sched_data; + struct bfq_sched_data *sched_data; + + unsigned short ioprio, new_ioprio; + unsigned short ioprio_class, new_ioprio_class; + + int ioprio_changed; +}; + +struct bfq_group; + +/** + * struct bfq_queue - leaf schedulable entity. + * @ref: reference counter. + * @bfqd: parent bfq_data. + * @new_bfqq: shared bfq_queue if queue is cooperating with + * one or more other queues. + * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). + * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). + * @sort_list: sorted list of pending requests. + * @next_rq: if fifo isn't expired, next request to serve. + * @queued: nr of requests queued in @sort_list. + * @allocated: currently allocated requests. + * @meta_pending: pending metadata requests. + * @fifo: fifo list of requests in sort_list. + * @entity: entity representing this queue in the scheduler. + * @max_budget: maximum budget allowed from the feedback mechanism. + * @budget_timeout: budget expiration (in jiffies). + * @dispatched: number of requests on the dispatch list or inside driver. + * @flags: status flags. + * @bfqq_list: node for active/idle bfqq list inside our bfqd. + * @burst_list_node: node for the device's burst list. + * @seek_samples: number of seeks sampled + * @seek_total: sum of the distances of the seeks sampled + * @seek_mean: mean seek distance + * @last_request_pos: position of the last request enqueued + * @requests_within_timer: number of consecutive pairs of request completion + * and arrival, such that the queue becomes idle + * after the completion, but the next request arrives + * within an idle time slice; used only if the queue's + * IO_bound has been cleared. + * @pid: pid of the process owning the queue, used for logging purposes. + * @last_wr_start_finish: start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period + * @wr_cur_max_time: current max raising time for this queue + * @soft_rt_next_start: minimum time instant such that, only if a new + * request is enqueued after this time instant in an + * idle @bfq_queue with no outstanding requests, then + * the task associated with the queue it is deemed as + * soft real-time (see the comments to the function + * bfq_bfqq_softrt_next_start()) + * @last_idle_bklogged: time of the last transition of the @bfq_queue from + * idle to backlogged + * @service_from_backlogged: cumulative service received from the @bfq_queue + * since the last transition from idle to + * backlogged + * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the + * queue is shared + * + * A bfq_queue is a leaf request queue; it can be associated with an + * io_context or more, if it is async or shared between cooperating + * processes. @cgroup holds a reference to the cgroup, to be sure that it + * does not disappear while a bfqq still references it (mostly to avoid + * races between request issuing and task migration followed by cgroup + * destruction). + * All the fields are protected by the queue lock of the containing bfqd. 
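/*
 * Worked example of the timestamp rule quoted in the entity
 * documentation above, F_i = S_i + budget/weight, for two
 * continuously backlogged entities with weights 2 and 1 and equal
 * budgets. The kernel layers scaled fixed-point arithmetic and the
 * min_start machinery on top of this; plain integers keep the idea
 * visible.
 */
#include <stdio.h>

struct ent {
	const char *name;
	unsigned int weight, budget;
	unsigned long start, finish;
};

int main(void)
{
	struct ent a = { "A", 2, 8, 0, 0 }, b = { "B", 1, 8, 0, 0 };
	struct ent *e[] = { &a, &b };
	int round, i;

	for (round = 0; round < 3; round++)
		for (i = 0; i < 2; i++) {
			e[i]->start = e[i]->finish;	/* back-to-back backlog */
			e[i]->finish = e[i]->start + e[i]->budget / e[i]->weight;
		}

	/*
	 * A's finish times: 4, 8, 12; B's: 8, 16, 24. Serving in
	 * increasing finish-time order therefore gives A twice the
	 * service of B, i.e. service proportional to weight.
	 */
	printf("A: finish=%lu  B: finish=%lu\n", a.finish, b.finish);
	return 0;
}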
+ */ +struct bfq_queue { + atomic_t ref; + struct bfq_data *bfqd; + + /* fields for cooperating queues handling */ + struct bfq_queue *new_bfqq; + struct rb_node pos_node; + struct rb_root *pos_root; + + struct rb_root sort_list; + struct request *next_rq; + int queued[2]; + int allocated[2]; + int meta_pending; + struct list_head fifo; + + struct bfq_entity entity; + + unsigned long max_budget; + unsigned long budget_timeout; + + int dispatched; + + unsigned int flags; + + struct list_head bfqq_list; + + struct hlist_node burst_list_node; + + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + sector_t last_request_pos; + + unsigned int requests_within_timer; + + pid_t pid; + struct bfq_io_cq *bic; + + /* weight-raising fields */ + unsigned long wr_cur_max_time; + unsigned long soft_rt_next_start; + unsigned long last_wr_start_finish; + unsigned int wr_coeff; + unsigned long last_idle_bklogged; + unsigned long service_from_backlogged; +}; + +/** + * struct bfq_ttime - per process thinktime stats. + * @ttime_total: total process thinktime + * @ttime_samples: number of thinktime samples + * @ttime_mean: average process thinktime + */ +struct bfq_ttime { + unsigned long last_end_request; + + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + * @icq: associated io_cq structure + * @bfqq: array of two process queues, the sync and the async + * @ttime: associated @bfq_ttime struct + * @wr_time_left: snapshot of the time left before weight raising ends + * for the sync queue associated to this process; this + * snapshot is taken to remember this value while the weight + * raising is suspended because the queue is merged with a + * shared queue, and is used to set @raising_cur_max_time + * when the queue is split from the shared queue and its + * weight is raised again + * @saved_idle_window: same purpose as the previous field for the idle + * window + * @saved_IO_bound: same purpose as the previous two fields for the I/O + * bound classification of a queue + * @saved_in_large_burst: same purpose as the previous fields for the + * value of the field keeping the queue's belonging + * to a large burst + * @was_in_burst_list: true if the queue belonged to a burst list + * before its merge with another cooperating queue + * @cooperations: counter of consecutive successful queue merges underwent + * by any of the process' @bfq_queues + * @failed_cooperations: counter of consecutive failed queue merges of any + * of the process' @bfq_queues + */ +struct bfq_io_cq { + struct io_cq icq; /* must be the first member */ + struct bfq_queue *bfqq[2]; + struct bfq_ttime ttime; + int ioprio; + + unsigned int wr_time_left; + bool saved_idle_window; + bool saved_IO_bound; + + bool saved_in_large_burst; + bool was_in_burst_list; + + unsigned int cooperations; + unsigned int failed_cooperations; +}; + +enum bfq_device_speed { + BFQ_BFQD_FAST, + BFQ_BFQD_SLOW, +}; + +/** + * struct bfq_data - per device data structure. + * @queue: request queue for the managed device. + * @root_group: root bfq_group for the device. + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_close_cooperator()). + * @active_numerous_groups: number of bfq_groups containing more than one + * active @bfq_entity. + * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by + * weight. 
Used to keep track of whether all @bfq_queues + * have the same weight. The tree contains one counter + * for each distinct weight associated to some active + * and not weight-raised @bfq_queue (see the comments to + * the functions bfq_weights_tree_[add|remove] for + * further details). + * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted + * by weight. Used to keep track of whether all + * @bfq_groups have the same weight. The tree contains + * one counter for each distinct weight associated to + * some active @bfq_group (see the comments to the + * functions bfq_weights_tree_[add|remove] for further + * details). + * @busy_queues: number of bfq_queues containing requests (including the + * queue in service, even if it is idling). + * @busy_in_flight_queues: number of @bfq_queues containing pending or + * in-flight requests, plus the @bfq_queue in + * service, even if idle but waiting for the + * possible arrival of its next sync request. This + * field is updated only if the device is rotational, + * but used only if the device is also NCQ-capable. + * The reason why the field is updated also for non- + * NCQ-capable rotational devices is related to the + * fact that the value of @hw_tag may be set also + * later than when busy_in_flight_queues may need to + * be incremented for the first time(s). Taking also + * this possibility into account, to avoid unbalanced + * increments/decrements, would imply more overhead + * than just updating busy_in_flight_queues + * regardless of the value of @hw_tag. + * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues + * (that is, seeky queues that expired + * for budget timeout at least once) + * containing pending or in-flight + * requests, including the in-service + * @bfq_queue if constantly seeky. This + * field is updated only if the device + * is rotational, but used only if the + * device is also NCQ-capable (see the + * comments to @busy_in_flight_queues). + * @wr_busy_queues: number of weight-raised busy @bfq_queues. + * @queued: number of queued requests. + * @rq_in_driver: number of requests dispatched and waiting for completion. + * @sync_flight: number of sync requests in the driver. + * @max_rq_in_driver: max number of reqs in driver in the last + * @hw_tag_samples completed requests. + * @hw_tag_samples: nr of samples used to calculate hw_tag. + * @hw_tag: flag set to one if the driver is showing a queueing behavior. + * @budgets_assigned: number of budgets assigned. + * @idle_slice_timer: timer set when idling for the next sequential request + * from the queue in service. + * @unplug_work: delayed work to restart dispatching on the request queue. + * @in_service_queue: bfq_queue in service. + * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. + * @last_position: on-disk position of the last served request. + * @last_budget_start: beginning of the last budget. + * @last_idling_start: beginning of the last idle slice. + * @peak_rate: peak transfer rate observed for a budget. + * @peak_rate_samples: number of samples used to calculate @peak_rate. + * @bfq_max_budget: maximum budget allotted to a bfq_queue before + * rescheduling. + * @group_list: list of all the bfq_groups active on the device. + * @active_list: list of all the bfq_queues active on the device. + * @idle_list: list of all the bfq_queues idle on the device. + * @bfq_fifo_expire: timeout for async/sync requests; when it expires + * requests are served in fifo order. 
+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
+ * @bfq_back_max: maximum allowed backward seek.
+ * @bfq_slice_idle: maximum idling time.
+ * @bfq_user_max_budget: user-configured max budget value
+ * (0 for auto-tuning).
+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
+ * async queues.
+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used
+ * to prevent seeky queues from imposing long latencies on
+ * well-behaved ones (this also implies that seeky queues cannot
+ * receive guarantees in the service domain; after a timeout
+ * they are charged for the whole allocated budget, to try
+ * to preserve a behavior reasonably fair among them, but
+ * without service-domain guarantees).
+ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is
+ * no longer granted any weight-raising.
+ * @bfq_failed_cooperations: number of consecutive failed cooperation
+ * chances after which weight-raising is restored
+ * to a queue subject to more than bfq_coop_thresh
+ * queue merges.
+ * @bfq_requests_within_timer: number of consecutive requests that must be
+ * issued within the idle time slice to re-enable
+ * idling for a queue that was marked as
+ * non-I/O-bound (see the definition of the
+ * IO_bound flag for further details).
+ * @last_ins_in_burst: last time at which a queue entered the current
+ * burst of queues being activated shortly after
+ * each other; for more details about this and the
+ * following parameters related to a burst of
+ * activations, see the comments to the function
+ * @bfq_handle_burst.
+ * @bfq_burst_interval: reference time interval used to decide whether a
+ * queue has been activated shortly after
+ * @last_ins_in_burst.
+ * @burst_size: number of queues in the current burst of queue activations.
+ * @bfq_large_burst_thresh: maximum burst size above which the current
+ * queue-activation burst is deemed as 'large'.
+ * @large_burst: true if a large queue-activation burst is in progress.
+ * @burst_list: head of the burst list (as for the above fields, more details
+ * in the comments to the function bfq_handle_burst).
+ * @low_latency: if set to true, low-latency heuristics are enabled.
+ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised
+ * queue is multiplied.
+ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).
+ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.
+ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising
+ * may be reactivated for a queue (in jiffies).
+ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals
+ * after which weight-raising may be
+ * reactivated for an already busy queue
+ * (in jiffies).
+ * @bfq_wr_max_softrt_rate: max service rate for a soft real-time queue,
+ * in sectors per second.
+ * @RT_prod: cached value of the product R*T, used for automatically
+ * computing the maximum duration of weight-raising.
+ * @device_speed: device-speed class for the low-latency heuristic.
+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.
+ *
+ * All the fields are protected by the @queue lock.
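/*
 * The burst fields documented above amount to a simple detector: an
 * activation arriving within bfq_burst_interval of the previous one
 * grows the current burst, and the burst becomes "large" once its
 * size exceeds bfq_large_burst_thresh. The sketch below uses made-up
 * thresholds and omits the burst list; see bfq_handle_burst in the
 * scheduler for the real rules.
 */
#include <stdio.h>

#define BURST_INTERVAL 8	/* pretend jiffies */
#define LARGE_THRESH 3

int main(void)
{
	long activations[] = { 0, 2, 5, 6, 30 };	/* activation times */
	long last = -1000;
	int burst_size = 0, large = 0, i;

	for (i = 0; i < 5; i++) {
		if (activations[i] - last <= BURST_INTERVAL) {
			burst_size++;
			if (burst_size > LARGE_THRESH)
				large = 1;
		} else {
			burst_size = 1;		/* gap too big: burst restarts */
			large = 0;
		}
		last = activations[i];
		printf("t=%2ld burst_size=%d large=%d\n",
		       activations[i], burst_size, large);
	}
	return 0;
}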
+ */ +struct bfq_data { + struct request_queue *queue; + + struct bfq_group *root_group; + struct rb_root rq_pos_tree; + +#ifdef CONFIG_CGROUP_BFQIO + int active_numerous_groups; +#endif + + struct rb_root queue_weights_tree; + struct rb_root group_weights_tree; + + int busy_queues; + int busy_in_flight_queues; + int const_seeky_busy_in_flight_queues; + int wr_busy_queues; + int queued; + int rq_in_driver; + int sync_flight; + + int max_rq_in_driver; + int hw_tag_samples; + int hw_tag; + + int budgets_assigned; + + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + struct bfq_queue *in_service_queue; + struct bfq_io_cq *in_service_bic; + + sector_t last_position; + + ktime_t last_budget_start; + ktime_t last_idling_start; + int peak_rate_samples; + u64 peak_rate; + unsigned long bfq_max_budget; + + struct hlist_head group_list; + struct list_head active_list; + struct list_head idle_list; + + unsigned int bfq_fifo_expire[2]; + unsigned int bfq_back_penalty; + unsigned int bfq_back_max; + unsigned int bfq_slice_idle; + u64 bfq_class_idle_last_service; + + unsigned int bfq_user_max_budget; + unsigned int bfq_max_budget_async_rq; + unsigned int bfq_timeout[2]; + + unsigned int bfq_coop_thresh; + unsigned int bfq_failed_cooperations; + unsigned int bfq_requests_within_timer; + + unsigned long last_ins_in_burst; + unsigned long bfq_burst_interval; + int burst_size; + unsigned long bfq_large_burst_thresh; + bool large_burst; + struct hlist_head burst_list; + + bool low_latency; + + /* parameters of the low_latency heuristics */ + unsigned int bfq_wr_coeff; + unsigned int bfq_wr_max_time; + unsigned int bfq_wr_rt_max_time; + unsigned int bfq_wr_min_idle_time; + unsigned long bfq_wr_min_inter_arr_async; + unsigned int bfq_wr_max_softrt_rate; + u64 RT_prod; + enum bfq_device_speed device_speed; + + struct bfq_queue oom_bfqq; +}; + +enum bfqq_state_flags { + BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ + BFQ_BFQQ_FLAG_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of + * its budget + */ + BFQ_BFQQ_FLAG_in_large_burst, /* + * bfqq activated in a large burst, + * see comments to bfq_handle_burst. 
+ */ + BFQ_BFQQ_FLAG_constantly_seeky, /* + * bfqq has proved to be slow and + * seeky until budget timeout + */ + BFQ_BFQQ_FLAG_softrt_update, /* + * may need softrt-next-start + * update + */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ + BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ +}; + +#define BFQ_BFQQ_FNS(name) \ +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +{ \ + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ +} + +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(must_alloc); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(budget_new); +BFQ_BFQQ_FNS(IO_bound); +BFQ_BFQQ_FNS(in_large_burst); +BFQ_BFQQ_FNS(constantly_seeky); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(just_split); +BFQ_BFQQ_FNS(softrt_update); +#undef BFQ_BFQQ_FNS + +/* Logging facilities. */ +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) + +#define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) + +/* Expiration reasons. */ +enum bfqq_expiration { + BFQ_BFQQ_TOO_IDLE = 0, /* + * queue has been idling for + * too long + */ + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ +}; + +#ifdef CONFIG_CGROUP_BFQIO +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @group_node: node to be inserted into the bfqio_cgroup->group_data + * list of the containing cgroup's bfqio_cgroup. + * @bfqd_node: node to be inserted into the @bfqd->group_list list + * of the groups active on the same device; used for cleanup. + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/ + * migration. + * @active_entities: number of active entities belonging to the group; + * unused for the root group. Used to know whether there + * are groups with more than one active @bfq_entity + * (see the comments to the function + * bfq_bfqq_must_not_expire()). + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @group_node is protected by the bfqio_cgroup lock, and is accessed + * via RCU from its readers. + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. 
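/*
 * The BFQ_BFQQ_FNS() macro above stamps out mark/clear/test helpers
 * for every per-queue flag from a single definition. The same
 * generator pattern in miniature, outside the kernel, with two
 * invented flags (the unused clear_busy() helper is emitted too,
 * exactly as the kernel macro emits helpers it may never call):
 */
#include <stdio.h>

struct q { unsigned int flags; };

enum { FLAG_busy = 0, FLAG_sync };

#define Q_FNS(name)							\
static void mark_##name(struct q *q)					\
{									\
	q->flags |= 1u << FLAG_##name;					\
}									\
static void clear_##name(struct q *q)					\
{									\
	q->flags &= ~(1u << FLAG_##name);				\
}									\
static int is_##name(const struct q *q)					\
{									\
	return (q->flags & (1u << FLAG_##name)) != 0;			\
}

Q_FNS(busy)
Q_FNS(sync)

int main(void)
{
	struct q q = { 0 };

	mark_busy(&q);
	mark_sync(&q);
	clear_sync(&q);
	printf("busy=%d sync=%d\n", is_busy(&q), is_sync(&q));
	return 0;
}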
+ */ +struct bfq_group { + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + struct hlist_node group_node; + struct hlist_node bfqd_node; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; + + int active_entities; +}; + +/** + * struct bfqio_cgroup - bfq cgroup data structure. + * @css: subsystem state for bfq in the containing cgroup. + * @online: flag marked when the subsystem is inserted. + * @weight: cgroup weight. + * @ioprio: cgroup ioprio. + * @ioprio_class: cgroup ioprio_class. + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. + * @group_data: list containing the bfq_group belonging to this cgroup. + * + * @group_data is accessed using RCU, with @lock protecting the updates, + * @ioprio and @ioprio_class are protected by @lock. + */ +struct bfqio_cgroup { + struct cgroup_subsys_state css; + bool online; + + unsigned short weight, ioprio, ioprio_class; + + spinlock_t lock; + struct hlist_head group_data; +}; +#else +struct bfq_group { + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; +}; +#endif + +static inline struct bfq_service_tree * +bfq_entity_service_tree(struct bfq_entity *entity) +{ + struct bfq_sched_data *sched_data = entity->sched_data; + unsigned int idx = entity->ioprio_class - 1; + + BUG_ON(idx >= BFQ_IOPRIO_CLASSES); + BUG_ON(sched_data == NULL); + + return sched_data->service_tree + idx; +} + +static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, + bool is_sync) +{ + return bic->bfqq[is_sync]; +} + +static inline void bic_set_bfqq(struct bfq_io_cq *bic, + struct bfq_queue *bfqq, bool is_sync) +{ + bic->bfqq[is_sync] = bfqq; +} + +static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +{ + return bic->icq.q->elevator->elevator_data; +} + +/** + * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. + * @ptr: a pointer to a bfqd. + * @flags: storage for the flags to be saved. + * + * This function allows bfqg->bfqd to be protected by the + * queue lock of the bfqd they reference; the pointer is dereferenced + * under RCU, so the storage for bfqd is assured to be safe as long + * as the RCU read side critical section does not end. After the + * bfqd->queue->queue_lock is taken the pointer is rechecked, to be + * sure that no other writer accessed it. If we raced with a writer, + * the function returns NULL, with the queue unlocked, otherwise it + * returns the dereferenced pointer, with the queue locked. 
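/*
 * The check-lock-recheck pattern described above, reduced to a
 * single-threaded pthread sketch: read the pointer, take the lock of
 * the object it names, then verify the pointer still names the same
 * object; otherwise drop the lock and report failure so the caller
 * can retry. Real RCU (rcu_read_lock/rcu_dereference and the
 * associated memory ordering) is deliberately replaced by a plain
 * read plus a simulated racing writer.
 */
#include <stdio.h>
#include <pthread.h>

struct dev { pthread_mutex_t lock; int id; };

static struct dev dev_a = { PTHREAD_MUTEX_INITIALIZER, 1 };
static struct dev dev_b = { PTHREAD_MUTEX_INITIALIZER, 2 };
static struct dev *current_dev = &dev_a;

/* returns the device with its lock held, or NULL if we lost a race */
static struct dev *get_dev_locked(int simulate_race)
{
	struct dev *d = current_dev;	/* rcu_dereference() in the kernel */

	pthread_mutex_lock(&d->lock);
	if (simulate_race)
		current_dev = &dev_b;	/* a writer slipped in between */
	if (current_dev == d)
		return d;		/* still valid; stays locked */
	pthread_mutex_unlock(&d->lock);
	return NULL;
}

int main(void)
{
	struct dev *d = get_dev_locked(0);

	printf("no race:   got dev %d\n", d ? d->id : -1);
	if (d)
		pthread_mutex_unlock(&d->lock);

	d = get_dev_locked(1);
	printf("with race: got dev %d\n", d ? d->id : -1);
	return 0;
}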
+ */
+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
+ unsigned long *flags)
+{
+ struct bfq_data *bfqd;
+
+ rcu_read_lock();
+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
+
+ if (bfqd != NULL) {
+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
+ if (*ptr == bfqd)
+ goto out;
+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
+ }
+
+ bfqd = NULL;
+out:
+ rcu_read_unlock();
+ return bfqd;
+}
+
+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
+ unsigned long *flags)
+{
+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
+}
+
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic);
+static void bfq_put_queue(struct bfq_queue *bfqq);
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+ struct bfq_group *bfqg, int is_sync,
+ struct bfq_io_cq *bic, gfp_t gfp_mask);
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+ struct bfq_group *bfqg);
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
+#endif /* _BFQ_H */
diff -Nur linux-4.2/drivers/accessibility/braille/braille_console.c linux-4.2-pck/drivers/accessibility/braille/braille_console.c
--- linux-4.2/drivers/accessibility/braille/braille_console.c 2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/drivers/accessibility/braille/braille_console.c 2015-09-08 00:48:22.218364355 -0300
@@ -116,7 +116,7 @@
 *c++ = csum;
 *c++ = ETX;
- braille_co->write(braille_co, data, c - data);
+ braille_co->write(braille_co, data, c - data, 0);
 }
 /* Follow the VC cursor*/
diff -Nur linux-4.2/drivers/acpi/blacklist.c linux-4.2-pck/drivers/acpi/blacklist.c
--- linux-4.2/drivers/acpi/blacklist.c 2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/drivers/acpi/blacklist.c 2015-09-08 00:48:22.218364355 -0300
@@ -317,6 +317,37 @@
 },
 /*
+ * The following Lenovo models have a broken workaround in the
+ * acpi_video backlight implementation to meet the Windows 8
+ * requirement of 101 backlight levels. Reverting to pre-Win8
+ * behavior fixes the problem.
+ */
+ {
+ .callback = dmi_disable_osi_win8,
+ .ident = "Lenovo ThinkPad T430",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T430"),
+ },
+ },
+ {
+ .callback = dmi_disable_osi_win8,
+ .ident = "Lenovo ThinkPad T430s",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T430s"),
+ },
+ },
+ {
+ .callback = dmi_disable_osi_win8,
+ .ident = "Lenovo ThinkPad X230",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad X230"),
+ },
+ },
+
+ /*
 * BIOS invocation of _OSI(Linux) is almost always a BIOS bug.
 * Linux ignores it, except for the machines enumerated below.
*/ diff -Nur linux-4.2/drivers/ata/acard-ahci.c linux-4.2-pck/drivers/ata/acard-ahci.c --- linux-4.2/drivers/ata/acard-ahci.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/acard-ahci.c 2015-09-08 00:48:22.218364355 -0300 @@ -478,6 +478,9 @@ ata_port_pbar_desc(ap, AHCI_PCI_BAR, 0x100 + ap->port_no * 0x80, "port"); + rc = ahci_setup_port_privdata(ap); + if (rc) + return rc; /* set initial link pm policy */ /* ap->pm_policy = NOT_AVAILABLE; diff -Nur linux-4.2/drivers/ata/ahci.c linux-4.2-pck/drivers/ata/ahci.c --- linux-4.2/drivers/ata/ahci.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/ahci.c 2015-09-08 00:48:22.218364355 -0300 @@ -1597,6 +1597,9 @@ if (ap->flags & ATA_FLAG_EM) ap->em_message_type = hpriv->em_msg_type; + rc = ahci_setup_port_privdata(ap); + if (rc) + return rc; /* disabled/not-implemented port */ if (!(hpriv->port_map & (1 << i))) diff -Nur linux-4.2/drivers/ata/ahci.h linux-4.2-pck/drivers/ata/ahci.h --- linux-4.2/drivers/ata/ahci.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/ahci.h 2015-09-08 00:48:22.218364355 -0300 @@ -315,6 +315,12 @@ /* enclosure management info per PM slot */ struct ahci_em_priv em_priv[EM_MAX_SLOTS]; char *irq_desc; /* desc in /proc/interrupts */ + bool init_alpe; /* alpe enabled by default */ + bool init_asp; /* asp enabled by default */ + bool init_devslp; /* devslp enabled by default */ + u32 init_dito; /* initial dito configuration */ + u32 init_deto; /* initial deto configuration */ + u32 init_mdat; /* initial mdat configuration */ }; struct ahci_host_priv { @@ -374,6 +380,7 @@ extern struct ata_port_operations ahci_pmp_retry_srst_ops; unsigned int ahci_dev_classify(struct ata_port *ap); +int ahci_setup_port_privdata(struct ata_port *ap); void ahci_fill_cmd_slot(struct ahci_port_priv *pp, unsigned int tag, u32 opts); void ahci_save_initial_config(struct device *dev, diff -Nur linux-4.2/drivers/ata/libahci.c linux-4.2-pck/drivers/ata/libahci.c --- linux-4.2/drivers/ata/libahci.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libahci.c 2015-09-08 00:48:22.218364355 -0300 @@ -684,6 +684,7 @@ { struct ata_port *ap = link->ap; struct ahci_host_priv *hpriv = ap->host->private_data; + struct ahci_port_priv *ppriv = ap->private_data; struct ahci_port_priv *pp = ap->private_data; void __iomem *port_mmio = ahci_port_base(ap); @@ -701,9 +702,9 @@ if (hpriv->cap & HOST_CAP_ALPM) { u32 cmd = readl(port_mmio + PORT_CMD); + cmd &= ~(PORT_CMD_ASP | PORT_CMD_ALPE); if (policy == ATA_LPM_MAX_POWER || !(hints & ATA_LPM_HIPM)) { - cmd &= ~(PORT_CMD_ASP | PORT_CMD_ALPE); cmd |= PORT_CMD_ICC_ACTIVE; writel(cmd, port_mmio + PORT_CMD); @@ -711,6 +712,13 @@ /* wait 10ms to be sure we've come out of LPM state */ ata_msleep(ap, 10); + } else if (policy == ATA_LPM_FIRMWARE_DEFAULTS) { + if (ppriv->init_alpe) + cmd |= PORT_CMD_ALPE; + if (ppriv->init_asp) + cmd |= PORT_CMD_ASP; + + writel(cmd, port_mmio + PORT_CMD); } else { cmd |= PORT_CMD_ALPE; if (policy == ATA_LPM_MIN_POWER) @@ -725,10 +733,17 @@ if ((hpriv->cap2 & HOST_CAP2_SDS) && (hpriv->cap2 & HOST_CAP2_SADM) && (link->device->flags & ATA_DFLAG_DEVSLP)) { - if (policy == ATA_LPM_MIN_POWER) + switch (policy) { + case ATA_LPM_MIN_POWER: ahci_set_aggressive_devslp(ap, true); - else + break; + case ATA_LPM_FIRMWARE_DEFAULTS: + ahci_set_aggressive_devslp(ap, ppriv->init_devslp); + break; + default: ahci_set_aggressive_devslp(ap, false); + break; + } } if (policy == ATA_LPM_MAX_POWER) { @@ -2040,6 +2055,7 @@ static void 
ahci_set_aggressive_devslp(struct ata_port *ap, bool sleep) { struct ahci_host_priv *hpriv = ap->host->private_data; + struct ahci_port_priv *ppriv = ap->private_data; void __iomem *port_mmio = ahci_port_base(ap); struct ata_device *dev = ap->link.device; u32 devslp, dm, dito, mdat, deto; @@ -2075,26 +2091,32 @@ if (rc) return; - dm = (devslp & PORT_DEVSLP_DM_MASK) >> PORT_DEVSLP_DM_OFFSET; - dito = devslp_idle_timeout / (dm + 1); - if (dito > 0x3ff) - dito = 0x3ff; - - /* Use the nominal value 10 ms if the read MDAT is zero, - * the nominal value of DETO is 20 ms. - */ - if (dev->devslp_timing[ATA_LOG_DEVSLP_VALID] & - ATA_LOG_DEVSLP_VALID_MASK) { - mdat = dev->devslp_timing[ATA_LOG_DEVSLP_MDAT] & - ATA_LOG_DEVSLP_MDAT_MASK; - if (!mdat) + if (ppriv->init_devslp) { + dito = ppriv->init_dito; + deto = ppriv->init_deto; + mdat = ppriv->init_mdat; + } else { + dm = (devslp & PORT_DEVSLP_DM_MASK) >> PORT_DEVSLP_DM_OFFSET; + dito = devslp_idle_timeout / (dm + 1); + if (dito > 0x3ff) + dito = 0x3ff; + + /* Use the nominal value 10 ms if the read MDAT is zero, + * the nominal value of DETO is 20 ms. + */ + if (dev->devslp_timing[ATA_LOG_DEVSLP_VALID] & + ATA_LOG_DEVSLP_VALID_MASK) { + mdat = dev->devslp_timing[ATA_LOG_DEVSLP_MDAT] & + ATA_LOG_DEVSLP_MDAT_MASK; + if (!mdat) + mdat = 10; + deto = dev->devslp_timing[ATA_LOG_DEVSLP_DETO]; + if (!deto) + deto = 20; + } else { mdat = 10; - deto = dev->devslp_timing[ATA_LOG_DEVSLP_DETO]; - if (!deto) deto = 20; - } else { - mdat = 10; - deto = 20; + } } devslp |= ((dito << PORT_DEVSLP_DITO_OFFSET) | @@ -2257,19 +2279,53 @@ } #endif +/* + * Allocate port privdata and read back initial power management configuration + */ +int ahci_setup_port_privdata(struct ata_port *ap) +{ + struct ahci_port_priv *pp; + u32 cmd, devslp; + void __iomem *port_mmio = ahci_port_base(ap); + + pp = kzalloc(sizeof(*pp), GFP_KERNEL); + if (!pp) + return -ENOMEM; + + ap->private_data = pp; + + cmd = readl(port_mmio + PORT_CMD); + + if (cmd & PORT_CMD_ALPE) + pp->init_alpe = true; + + if (cmd & PORT_CMD_ASP) + pp->init_asp = true; + + devslp = readl(port_mmio + PORT_DEVSLP); + + /* devslp unsupported or disabled */ + if (!(devslp & PORT_DEVSLP_DSP) || !(devslp & PORT_DEVSLP_ADSE)) + return 0; + + pp->init_devslp = true; + pp->init_dito = (devslp >> PORT_DEVSLP_DITO_OFFSET) & 0x3ff; + pp->init_deto = (devslp >> PORT_DEVSLP_DETO_OFFSET) & 0xff; + pp->init_mdat = (devslp >> PORT_DEVSLP_MDAT_OFFSET) & 0x1f; + + return 0; +} +EXPORT_SYMBOL_GPL(ahci_setup_port_privdata); + static int ahci_port_start(struct ata_port *ap) { struct ahci_host_priv *hpriv = ap->host->private_data; + struct ahci_port_priv *pp = ap->private_data; struct device *dev = ap->host->dev; - struct ahci_port_priv *pp; void *mem; dma_addr_t mem_dma; size_t dma_sz, rx_fis_sz; - pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL); - if (!pp) - return -ENOMEM; - if (ap->host->n_ports > 1) { pp->irq_desc = devm_kzalloc(dev, 8, GFP_KERNEL); if (!pp->irq_desc) { @@ -2348,8 +2404,6 @@ ap->lock = &pp->lock; } - ap->private_data = pp; - /* engage engines, captain */ return ahci_port_resume(ap); } diff -Nur linux-4.2/drivers/ata/libahci_platform.c linux-4.2-pck/drivers/ata/libahci_platform.c --- linux-4.2/drivers/ata/libahci_platform.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libahci_platform.c 2015-09-08 00:48:22.218364355 -0300 @@ -565,6 +565,10 @@ if (ap->flags & ATA_FLAG_EM) ap->em_message_type = hpriv->em_msg_type; + rc = ahci_setup_port_privdata(ap); + if (rc) + return rc; + /* 
disabled/not-implemented port */ if (!(hpriv->port_map & (1 << i))) ap->ops = &ata_dummy_port_ops; diff -Nur linux-4.2/drivers/ata/libata-core.c linux-4.2-pck/drivers/ata/libata-core.c --- linux-4.2/drivers/ata/libata-core.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libata-core.c 2015-09-08 00:48:22.221697530 -0300 @@ -2024,6 +2024,9 @@ } } + if (id[79] & (1 << SATA_DIPM)) + dev->init_dipm = true; + *p_class = class; return 0; @@ -3655,6 +3658,11 @@ return rc; switch (policy) { + case ATA_LPM_FIRMWARE_DEFAULTS: + /* use the values we read at probe */ + scontrol &= ~(0x7 << 8); + scontrol |= (link->init_lpm << 8); + break; case ATA_LPM_MAX_POWER: /* disable all LPM transitions */ scontrol |= (0x7 << 8); @@ -5582,11 +5590,11 @@ } /** - * sata_link_init_spd - Initialize link->sata_spd_limit - * @link: Link to configure sata_spd_limit for + * sata_link_init_config - Initialize link->sata_spd_limit and init_lpm + * @link: Link to configure sata_spd_limit and init_lpm for * - * Initialize @link->[hw_]sata_spd_limit to the currently - * configured value. + * Initialize @link->[hw_]sata_spd_limit and @link->init_lpm to the + * currently configured value. * * LOCKING: * Kernel thread context (may sleep). @@ -5594,7 +5602,7 @@ * RETURNS: * 0 on success, -errno on failure. */ -int sata_link_init_spd(struct ata_link *link) +int sata_link_init_config(struct ata_link *link) { u8 spd; int rc; @@ -5611,6 +5619,8 @@ link->sata_spd_limit = link->hw_sata_spd_limit; + link->init_lpm = (link->saved_scontrol >> 8) & 0x7; + return 0; } @@ -6160,9 +6170,9 @@ ap->cbl = ATA_CBL_SATA; /* init sata_spd_limit to the current value */ - sata_link_init_spd(&ap->link); + sata_link_init_config(&ap->link); if (ap->slave_link) - sata_link_init_spd(ap->slave_link); + sata_link_init_config(ap->slave_link); /* print per-port info to dmesg */ xfer_mask = ata_pack_xfermask(ap->pio_mask, ap->mwdma_mask, diff -Nur linux-4.2/drivers/ata/libata-eh.c linux-4.2-pck/drivers/ata/libata-eh.c --- linux-4.2/drivers/ata/libata-eh.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libata-eh.c 2015-09-08 00:48:22.221697530 -0300 @@ -3429,9 +3429,9 @@ return 0; /* - * DIPM is enabled only for MIN_POWER as some devices - * misbehave when the host NACKs transition to SLUMBER. Order - * device and link configurations such that the host always + * DIPM is enabled only for MIN_POWER and FIRMWARE_DEFAULT as some + * devices misbehave when the host NACKs transition to SLUMBER. + * Order device and link configurations such that the host always * allows DIPM requests. 
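/*
 * What ahci_setup_port_privdata() above captures: the DevSleep
 * timings the firmware programmed into PORT_DEVSLP, so that the new
 * firmware_defaults LPM policy can restore them later. A user-space
 * decoder for the same register value; the bit offsets follow the
 * AHCI DevSleep layout mirrored by the masks in the patch, and the
 * register value itself is invented for the example.
 */
#include <stdio.h>
#include <stdint.h>

#define PORT_DEVSLP_ADSE	(1u << 0)	/* aggressive DevSleep enable */
#define PORT_DEVSLP_DSP		(1u << 1)	/* DevSleep present */
#define PORT_DEVSLP_DETO_OFFSET	2		/* exit timeout, 8 bits */
#define PORT_DEVSLP_MDAT_OFFSET	10		/* min assertion time, 5 bits */
#define PORT_DEVSLP_DITO_OFFSET	15		/* idle timeout, 10 bits */

int main(void)
{
	/* hypothetical value as left behind by the firmware */
	uint32_t devslp = PORT_DEVSLP_DSP | PORT_DEVSLP_ADSE |
			  (20u << PORT_DEVSLP_DETO_OFFSET) |
			  (10u << PORT_DEVSLP_MDAT_OFFSET) |
			  (625u << PORT_DEVSLP_DITO_OFFSET);

	if (!(devslp & PORT_DEVSLP_DSP) || !(devslp & PORT_DEVSLP_ADSE)) {
		puts("devslp unsupported or disabled by firmware");
		return 0;
	}
	printf("init_dito=%u init_deto=%u init_mdat=%u\n",
	       (devslp >> PORT_DEVSLP_DITO_OFFSET) & 0x3ff,
	       (devslp >> PORT_DEVSLP_DETO_OFFSET) & 0xff,
	       (devslp >> PORT_DEVSLP_MDAT_OFFSET) & 0x1f);
	return 0;
}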
*/ ata_for_each_dev(dev, link, ENABLED) { @@ -3491,10 +3491,13 @@ if (ap && ap->slave_link) ap->slave_link->lpm_policy = policy; - /* host config updated, enable DIPM if transitioning to MIN_POWER */ + /* host config updated, enable DIPM if transitioning to MIN_POWER or + * FIRMWARE_DEFAULT when enabled by firmware + */ ata_for_each_dev(dev, link, ENABLED) { - if (policy == ATA_LPM_MIN_POWER && !no_dipm && - ata_id_has_dipm(dev->id)) { + if ((policy == ATA_LPM_MIN_POWER && !no_dipm && + ata_id_has_dipm(dev->id)) || + (policy == ATA_LPM_FIRMWARE_DEFAULTS && dev->init_dipm)) { err_mask = ata_dev_set_feature(dev, SETFEATURES_SATA_ENABLE, SATA_DIPM); if (err_mask && err_mask != AC_ERR_DEV) { diff -Nur linux-4.2/drivers/ata/libata-pmp.c linux-4.2-pck/drivers/ata/libata-pmp.c --- linux-4.2/drivers/ata/libata-pmp.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libata-pmp.c 2015-09-08 00:48:22.221697530 -0300 @@ -538,7 +538,7 @@ ap->ops->pmp_attach(ap); ata_for_each_link(tlink, ap, EDGE) - sata_link_init_spd(tlink); + sata_link_init_config(tlink); return 0; diff -Nur linux-4.2/drivers/ata/libata-scsi.c linux-4.2-pck/drivers/ata/libata-scsi.c --- linux-4.2/drivers/ata/libata-scsi.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libata-scsi.c 2015-09-08 00:48:22.221697530 -0300 @@ -107,6 +107,7 @@ static const char *ata_lpm_policy_names[] = { [ATA_LPM_UNKNOWN] = "max_performance", [ATA_LPM_MAX_POWER] = "max_performance", + [ATA_LPM_FIRMWARE_DEFAULTS] = "firmware_defaults", [ATA_LPM_MED_POWER] = "medium_power", [ATA_LPM_MIN_POWER] = "min_power", }; diff -Nur linux-4.2/drivers/ata/libata.h linux-4.2-pck/drivers/ata/libata.h --- linux-4.2/drivers/ata/libata.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libata.h 2015-09-08 00:48:22.221697530 -0300 @@ -98,7 +98,7 @@ extern bool ata_phys_link_offline(struct ata_link *link); extern void ata_dev_init(struct ata_device *dev); extern void ata_link_init(struct ata_port *ap, struct ata_link *link, int pmp); -extern int sata_link_init_spd(struct ata_link *link); +extern int sata_link_init_config(struct ata_link *link); extern int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg); extern int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg); extern struct ata_port *ata_port_alloc(struct ata_host *host); diff -Nur linux-4.2/drivers/ata/sata_highbank.c linux-4.2-pck/drivers/ata/sata_highbank.c --- linux-4.2/drivers/ata/sata_highbank.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/sata_highbank.c 2015-09-08 00:48:22.221697530 -0300 @@ -557,6 +557,10 @@ if (ap->flags & ATA_FLAG_EM) ap->em_message_type = hpriv->em_msg_type; + rc = ahci_setup_port_privdata(ap); + if (rc) + goto err0; + /* disabled/not-implemented port */ if (!(hpriv->port_map & (1 << i))) ap->ops = &ata_dummy_port_ops; diff -Nur linux-4.2/drivers/cpufreq/cpufreq_ondemand.c linux-4.2-pck/drivers/cpufreq/cpufreq_ondemand.c --- linux-4.2/drivers/cpufreq/cpufreq_ondemand.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/cpufreq/cpufreq_ondemand.c 2015-09-08 00:48:22.221697530 -0300 @@ -20,7 +20,11 @@ /* On-demand governor macros */ #define DEF_FREQUENCY_UP_THRESHOLD (80) +#ifdef CONFIG_PCK_INTERACTIVE +#define DEF_SAMPLING_DOWN_FACTOR (10) +#else #define DEF_SAMPLING_DOWN_FACTOR (1) +#endif #define MAX_SAMPLING_DOWN_FACTOR (100000) #define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) diff -Nur linux-4.2/drivers/gpu/drm/radeon/r600.c 
linux-4.2-pck/drivers/gpu/drm/radeon/r600.c --- linux-4.2/drivers/gpu/drm/radeon/r600.c 2015-08-30 16:21:13.000000000 -0300 +++ linux-4.2-pck/drivers/gpu/drm/radeon/r600.c 2015-09-08 00:49:30.085144720 -0300 @@ -2489,7 +2489,7 @@ } DRM_INFO("Loading %s Microcode\n", chip_name); - +#if 0 snprintf(fw_name, sizeof(fw_name), "/*(DEBLOBBED)*/", chip_name); err = reject_firmware(&rdev->pfp_fw, fw_name, rdev->dev); if (err) @@ -2541,7 +2541,7 @@ err = -EINVAL; } } - +#endif out: if (err) { if (err != -EINVAL) @@ -3201,7 +3201,7 @@ r = radeon_bo_init(rdev); if (r) return r; - +#if 0 if (!rdev->me_fw || !rdev->pfp_fw || !rdev->rlc_fw) { r = r600_init_microcode(rdev); if (r) { @@ -3209,7 +3209,7 @@ return r; } } - +#endif /* Initialize power management */ radeon_pm_init(rdev); diff -Nur linux-4.2/drivers/gpu/drm/radeon/r600_cp.c linux-4.2-pck/drivers/gpu/drm/radeon/r600_cp.c --- linux-4.2/drivers/gpu/drm/radeon/r600_cp.c 2015-08-30 16:21:17.000000000 -0300 +++ linux-4.2-pck/drivers/gpu/drm/radeon/r600_cp.c 2015-09-08 00:49:30.085144720 -0300 @@ -2241,7 +2241,7 @@ else r600_vm_init(dev); } - +#if 0 if (!dev_priv->me_fw || !dev_priv->pfp_fw) { int err = r600_cp_init_microcode(dev_priv); if (err) { @@ -2250,6 +2250,9 @@ return err; } } +#endif + printk(KERN_INFO "Skipping firmware loading\n"); + if (((dev_priv->flags & RADEON_FAMILY_MASK) >= CHIP_RV770)) r700_cp_load_microcode(dev_priv); else diff -Nur linux-4.2/drivers/input/joydev.c linux-4.2-pck/drivers/input/joydev.c --- linux-4.2/drivers/input/joydev.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/input/joydev.c 2015-09-08 00:48:22.221697530 -0300 @@ -28,15 +28,21 @@ #include #include #include +#include +#include MODULE_AUTHOR("Vojtech Pavlik "); MODULE_DESCRIPTION("Joystick device interfaces"); MODULE_SUPPORTED_DEVICE("input/js"); MODULE_LICENSE("GPL"); - #define JOYDEV_MINOR_BASE 0 #define JOYDEV_MINORS 16 #define JOYDEV_BUFFER_SIZE 64 +#define MAX_REMAP_SIZE 10 + +static int remap_array[MAX_REMAP_SIZE]; +static int remap_count = 0; +static int free_buttons[MAX_REMAP_SIZE]; struct joydev { int open; @@ -71,6 +77,9 @@ struct list_head node; }; +module_param_array(remap_array, int, &remap_count, 0); +MODULE_PARM_DESC(remap_array, "remap axes to buttons"); + static int joydev_correct(int value, struct js_corr *corr) { switch (corr->type) { @@ -121,6 +130,17 @@ struct joydev *joydev = handle->private; struct joydev_client *client; struct js_event event; + int i; + + if (remap_count > 0 && remap_count <= MAX_REMAP_SIZE) { + for (i = 0; i < remap_count; i++) + if (code == remap_array[i]) { + type = EV_KEY; + code = free_buttons[i]; + if (value == 255) + value = 1; + } + } switch (type) { @@ -826,7 +846,7 @@ const struct input_device_id *id) { struct joydev *joydev; - int i, j, t, minor, dev_no; + int i, j = 0, t, minor, dev_no; int error; minor = input_get_new_minor(JOYDEV_MINOR_BASE, JOYDEV_MINORS, true); @@ -871,15 +891,24 @@ joydev->keymap[i] = joydev->nkey; joydev->keypam[joydev->nkey] = i + BTN_MISC; joydev->nkey++; - } + j = i; + } + + if (remap_count > 0 && remap_count <= MAX_REMAP_SIZE) { + printk(KERN_INFO "joydev: axis remapping enabled\n"); + for (i = 0; i < remap_count; i++) { + joydev->keymap[j + i + 1] = joydev->nkey; + joydev->keypam[joydev->nkey] = i + j + 1 + BTN_MISC; + free_buttons[i] = j + i + 1 + BTN_MISC; + joydev->nkey++; + } + } for (i = 0; i < BTN_JOYSTICK - BTN_MISC; i++) if (test_bit(i + BTN_MISC, dev->keybit)) { joydev->keymap[i] = joydev->nkey; joydev->keypam[joydev->nkey] = i + BTN_MISC; joydev->nkey++; } for (i
= 0; i < joydev->nabs; i++) { j = joydev->abspam[i]; if (input_abs_get_max(dev, j) == input_abs_get_min(dev, j)) { diff -Nur linux-4.2/drivers/input/mouse/synaptics.c linux-4.2-pck/drivers/input/mouse/synaptics.c --- linux-4.2/drivers/input/mouse/synaptics.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/input/mouse/synaptics.c 2015-09-08 00:48:22.221697530 -0300 @@ -1250,7 +1250,9 @@ /* Clickpads report only left button */ __clear_bit(BTN_RIGHT, dev->keybit); __clear_bit(BTN_MIDDLE, dev->keybit); - } + } else if (SYN_CAP_CLICKPAD2BTN(priv->ext_cap_0c) || + SYN_CAP_CLICKPAD2BTN2(priv->ext_cap_0c)) + __set_bit(INPUT_PROP_BUTTONPAD, dev->propbit); } static ssize_t synaptics_show_disable_gesture(struct psmouse *psmouse, diff -Nur linux-4.2/drivers/input/mouse/synaptics.h linux-4.2-pck/drivers/input/mouse/synaptics.h --- linux-4.2/drivers/input/mouse/synaptics.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/input/mouse/synaptics.h 2015-09-08 00:48:22.221697530 -0300 @@ -85,6 +85,7 @@ */ #define SYN_CAP_CLICKPAD(ex0c) ((ex0c) & 0x100000) /* 1-button ClickPad */ #define SYN_CAP_CLICKPAD2BTN(ex0c) ((ex0c) & 0x000100) /* 2-button ClickPad */ +#define SYN_CAP_CLICKPAD2BTN2(ex0c) ((ex0c) & 0x200000) /* 2-button ClickPad */ #define SYN_CAP_MAX_DIMENSIONS(ex0c) ((ex0c) & 0x020000) #define SYN_CAP_MIN_DIMENSIONS(ex0c) ((ex0c) & 0x002000) #define SYN_CAP_ADV_GESTURE(ex0c) ((ex0c) & 0x080000) diff -Nur linux-4.2/drivers/macintosh/Kconfig linux-4.2-pck/drivers/macintosh/Kconfig --- linux-4.2/drivers/macintosh/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/macintosh/Kconfig 2015-09-08 00:48:22.221697530 -0300 @@ -172,6 +172,13 @@ If unsure, say Y. +config ADB_TRACKPAD_ABSOLUTE + bool "Enable absolute mode for adb trackpads" + depends on INPUT_ADBHID + help + Enable absolute mode in ADB-based trackpads. This feature adds + compatibility with the Synaptics X.Org/XFree86 drivers. + config MAC_EMUMOUSEBTN tristate "Support for mouse button 2+3 emulation" depends on SYSCTL && INPUT diff -Nur linux-4.2/drivers/macintosh/adbhid.c linux-4.2-pck/drivers/macintosh/adbhid.c --- linux-4.2/drivers/macintosh/adbhid.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/macintosh/adbhid.c 2015-09-08 00:48:22.221697530 -0300 @@ -261,6 +261,15 @@ #define ADBMOUSE_MS_A3 8 /* Mouse systems A3 trackball (handler 3) */ #define ADBMOUSE_MACALLY2 9 /* MacAlly 2-button mouse */ +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE +#define ABS_XMIN 310 +#define ABS_XMAX 1700 +#define ABS_YMIN 200 +#define ABS_YMAX 1000 +#define ABS_ZMIN 0 +#define ABS_ZMAX 55 +#endif + static void adbhid_keyboard_input(unsigned char *data, int nb, int apoll) { @@ -405,6 +414,9 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) { int id = (data[0] >> 4) & 0x0f; +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + int btn = 0, x_axis = 0, y_axis = 0, z_axis = 0; +#endif if (!adbhid[id]) { printk(KERN_ERR "ADB HID on ID %d not yet registered\n", id); @@ -436,6 +448,17 @@ high bits of y-axis motion. XY is additional high bits of x-axis motion. + For the ADB Absolute motion protocol, the data array will contain the + following values: + + BITS COMMENTS + data[0] = dddd 1100 ADB command: Talk, register 0, for device dddd. + data[1] = byyy yyyy Left button and y-axis motion. + data[2] = bxxx xxxx Second button and x-axis motion. + data[3] = 1yyy 1xxx Half bits of y-axis and x-axis motion. + data[4] = 1yyy 1xxx Higher bits of y-axis and x-axis motion.
+ data[5] = 1zzz 1zzz Higher and lower bits of z-pressure. + MacAlly 2-button mouse protocol. For MacAlly 2-button mouse protocol the data array will contain the @@ -458,8 +481,17 @@ switch (adbhid[id]->mouse_kind) { case ADBMOUSE_TRACKPAD: +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + x_axis = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) | + ((data[4] & 0x07) << 10); + y_axis = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) | + ((data[4] & 0x70) << 6); + z_axis = (data[5] & 0x07) | ((data[5] & 0x70) >> 1); + btn = (!(data[1] >> 7)) & 1; +#else data[1] = (data[1] & 0x7f) | ((data[1] & data[2]) & 0x80); data[2] = data[2] | 0x80; +#endif break; case ADBMOUSE_MICROSPEED: data[1] = (data[1] & 0x7f) | ((data[3] & 0x01) << 7); @@ -485,17 +517,39 @@ break; } - input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); - input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + if (adbhid[id]->mouse_kind == ADBMOUSE_TRACKPAD) { - if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) - input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); + if (z_axis > 30) + input_report_key(adbhid[id]->input, BTN_TOUCH, 1); + if (z_axis < 25) + input_report_key(adbhid[id]->input, BTN_TOUCH, 0); - input_report_rel(adbhid[id]->input, REL_X, - ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); - input_report_rel(adbhid[id]->input, REL_Y, - ((data[1]&0x7f) < 64 ? (data[1]&0x7f) : (data[1]&0x7f)-128 )); + if (z_axis > 0) { + input_report_abs(adbhid[id]->input, ABS_X, x_axis); + input_report_abs(adbhid[id]->input, ABS_Y, y_axis); + input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 1); + input_report_abs(adbhid[id]->input, ABS_TOOL_WIDTH, 5); + } else { + input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 0); + input_report_abs(adbhid[id]->input, ABS_TOOL_WIDTH, 0); + } + + input_report_abs(adbhid[id]->input, ABS_PRESSURE, z_axis); + input_report_key(adbhid[id]->input, BTN_LEFT, btn); + } else { +#endif + input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); + input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); + + if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) + input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); + input_report_rel(adbhid[id]->input, REL_X, + ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); + input_report_rel(adbhid[id]->input, REL_Y, + ((data[1]&0x7f) < 64 ?
(data[1]&0x7f) : (data[1]&0x7f)-128 )); +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + } +#endif input_sync(adbhid[id]->input); } @@ -849,6 +903,15 @@ input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + set_bit(EV_ABS, input_dev->evbit); + input_set_abs_params(input_dev, ABS_X, ABS_XMIN, ABS_XMAX, 0, 0); + input_set_abs_params(input_dev, ABS_Y, ABS_YMIN, ABS_YMAX, 0, 0); + input_set_abs_params(input_dev, ABS_PRESSURE, ABS_ZMIN, ABS_ZMAX, 0, 0); + set_bit(BTN_TOUCH, input_dev->keybit); + set_bit(BTN_TOOL_FINGER, input_dev->keybit); + set_bit(ABS_TOOL_WIDTH, input_dev->absbit); +#endif break; case ADB_MISC: @@ -1132,7 +1195,11 @@ r1_buffer[3], r1_buffer[4], r1_buffer[5], +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + 0x00, /* Enable absolute mode */ +#else 0x03, /*r1_buffer[6],*/ +#endif r1_buffer[7]); /* Without this flush, the trackpad may be locked up */ diff -Nur linux-4.2/drivers/net/ethernet/intel/e1000e/netdev.c linux-4.2-pck/drivers/net/ethernet/intel/e1000e/netdev.c --- linux-4.2/drivers/net/ethernet/intel/e1000e/netdev.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/net/ethernet/intel/e1000e/netdev.c 2015-09-08 00:48:22.225030705 -0300 @@ -4280,18 +4280,29 @@ struct e1000_adapter *adapter = container_of(cc, struct e1000_adapter, cc); struct e1000_hw *hw = &adapter->hw; + u32 systimel_1, systimel_2, systimeh; cycle_t systim, systim_next; - /* SYSTIMH latching upon SYSTIML read does not work well. To fix that - * we don't want to allow overflow of SYSTIML and a change to SYSTIMH - * to occur between reads, so if we read a vale close to overflow, we - * wait for overflow to occur and read both registers when its safe. + /* SYSTIMH latching upon SYSTIML read does not work well. + * This means that if SYSTIML overflows after we read it but before + * we read SYSTIMH, the value of SYSTIMH has been incremented and we + * will experience a huge non-linear increment in the systime value. + * To fix that, we test for overflow and, if one occurred, re-read + * systime. */ - u32 systim_overflow_latch_fix = 0x3FFFFFFF; - - do { - systim = (cycle_t)er32(SYSTIML); - } while (systim > systim_overflow_latch_fix); - systim |= (cycle_t)er32(SYSTIMH) << 32; + systimel_1 = er32(SYSTIML); + systimeh = er32(SYSTIMH); + systimel_2 = er32(SYSTIML); + /* Check for overflow.
If there was no overflow, use the values */ + if (systimel_1 < systimel_2) { + systim = (cycle_t)systimel_1; + systim |= (cycle_t)systimeh << 32; + } else { + /* There was an overflow; re-read SYSTIMH and use + * systimel_2. + */ + systimeh = er32(SYSTIMH); + systim = (cycle_t)systimel_2; + systim |= (cycle_t)systimeh << 32; + } if ((hw->mac.type == e1000_82574) || (hw->mac.type == e1000_82583)) { u64 incvalue, time_delta, rem, temp; diff -Nur linux-4.2/drivers/net/netconsole.c linux-4.2-pck/drivers/net/netconsole.c --- linux-4.2/drivers/net/netconsole.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/net/netconsole.c 2015-09-08 00:48:22.225030705 -0300 @@ -836,7 +836,7 @@ } static void write_ext_msg(struct console *con, const char *msg, - unsigned int len) + unsigned int len, unsigned int loglevel) { struct netconsole_target *nt; unsigned long flags; @@ -851,7 +851,8 @@ spin_unlock_irqrestore(&target_list_lock, flags); } -static void write_msg(struct console *con, const char *msg, unsigned int len) +static void write_msg(struct console *con, const char *msg, unsigned int len, + unsigned int loglevel) { int frag, left; unsigned long flags; diff -Nur linux-4.2/drivers/platform/x86/Kconfig linux-4.2-pck/drivers/platform/x86/Kconfig --- linux-4.2/drivers/platform/x86/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/platform/x86/Kconfig 2015-09-08 00:48:22.225030705 -0300 @@ -489,9 +489,30 @@ If you are not sure, say Y here. The driver enables polling only if it is strictly necessary to do so. +config THINKPAD_EC + tristate + depends on X86 + ---help--- + This is a low-level driver for accessing the ThinkPad H8S embedded + controller over the LPC bus (not to be confused with the ACPI Embedded + Controller interface). + +config TP_SMAPI + tristate "ThinkPad SMAPI Support" + depends on X86 + select THINKPAD_EC + default n + help + This adds SMAPI support on Lenovo/IBM ThinkPads, for features such + as battery charging control. For more information about this driver + see . + + If you have a Lenovo/IBM ThinkPad laptop, say Y or M here.
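+ + The controls are exposed through sysfs; battery charge thresholds, + for example, typically appear under /sys/devices/platform/smapi/BAT0/ + (the exact set of files varies by model and driver version).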
+ config SENSORS_HDAPS tristate "Thinkpad Hard Drive Active Protection System (hdaps)" depends on INPUT && X86 + select THINKPAD_EC select INPUT_POLLDEV default n help diff -Nur linux-4.2/drivers/platform/x86/Makefile linux-4.2-pck/drivers/platform/x86/Makefile --- linux-4.2/drivers/platform/x86/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/platform/x86/Makefile 2015-09-08 00:48:22.225030705 -0300 @@ -25,6 +25,8 @@ obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o obj-$(CONFIG_IDEAPAD_LAPTOP) += ideapad-laptop.o obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o +obj-$(CONFIG_THINKPAD_EC) += thinkpad_ec.o +obj-$(CONFIG_TP_SMAPI) += tp_smapi.o obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o obj-$(CONFIG_FUJITSU_TABLET) += fujitsu-tablet.o diff -Nur linux-4.2/drivers/platform/x86/hdaps.c linux-4.2-pck/drivers/platform/x86/hdaps.c --- linux-4.2/drivers/platform/x86/hdaps.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/platform/x86/hdaps.c 2015-09-08 00:48:22.225030705 -0300 @@ -30,266 +30,384 @@ #include #include -#include +#include #include -#include #include #include #include #include -#include - -#define HDAPS_LOW_PORT 0x1600 /* first port used by hdaps */ -#define HDAPS_NR_PORTS 0x30 /* number of ports: 0x1600 - 0x162f */ - -#define HDAPS_PORT_STATE 0x1611 /* device state */ -#define HDAPS_PORT_YPOS 0x1612 /* y-axis position */ -#define HDAPS_PORT_XPOS 0x1614 /* x-axis position */ -#define HDAPS_PORT_TEMP1 0x1616 /* device temperature, in Celsius */ -#define HDAPS_PORT_YVAR 0x1617 /* y-axis variance (what is this?) */ -#define HDAPS_PORT_XVAR 0x1619 /* x-axis variance (what is this?) */ -#define HDAPS_PORT_TEMP2 0x161b /* device temperature (again?) */ -#define HDAPS_PORT_UNKNOWN 0x161c /* what is this? */ -#define HDAPS_PORT_KMACT 0x161d /* keyboard or mouse activity */ - -#define STATE_FRESH 0x50 /* accelerometer data is fresh */ +#include +#include +#include + +/* Embedded controller accelerometer read command and its result: */ +static const struct thinkpad_ec_row ec_accel_args = + { .mask = 0x0001, .val = {0x11} }; +#define EC_ACCEL_IDX_READOUTS 0x1 /* readouts included in this read */ + /* First readout, if READOUTS>=1: */ +#define EC_ACCEL_IDX_YPOS1 0x2 /* y-axis position word */ +#define EC_ACCEL_IDX_XPOS1 0x4 /* x-axis position word */ +#define EC_ACCEL_IDX_TEMP1 0x6 /* device temperature in Celsius */ + /* Second readout, if READOUTS>=2: */ +#define EC_ACCEL_IDX_XPOS2 0x7 /* y-axis position word */ +#define EC_ACCEL_IDX_YPOS2 0x9 /* x-axis position word */ +#define EC_ACCEL_IDX_TEMP2 0xb /* device temperature in Celsius */ +#define EC_ACCEL_IDX_QUEUED 0xc /* Number of queued readouts left */ +#define EC_ACCEL_IDX_KMACT 0xd /* keyboard or mouse activity */ +#define EC_ACCEL_IDX_RETVAL 0xf /* command return value, good=0x00 */ #define KEYBD_MASK 0x20 /* set if keyboard activity */ #define MOUSE_MASK 0x40 /* set if mouse activity */ -#define KEYBD_ISSET(n) (!! (n & KEYBD_MASK)) /* keyboard used? */ -#define MOUSE_ISSET(n) (!! (n & MOUSE_MASK)) /* mouse used? */ -#define INIT_TIMEOUT_MSECS 4000 /* wait up to 4s for device init ... */ -#define INIT_WAIT_MSECS 200 /* ... 
in 200ms increments */ +#define READ_TIMEOUT_MSECS 100 /* wait this long for device read */ +#define RETRY_MSECS 3 /* retry delay */ -#define HDAPS_POLL_INTERVAL 50 /* poll for input every 1/20s (50 ms)*/ #define HDAPS_INPUT_FUZZ 4 /* input event threshold */ #define HDAPS_INPUT_FLAT 4 +#define KMACT_REMEMBER_PERIOD (HZ/10) /* keyboard/mouse persistence */ -#define HDAPS_X_AXIS (1 << 0) -#define HDAPS_Y_AXIS (1 << 1) -#define HDAPS_BOTH_AXES (HDAPS_X_AXIS | HDAPS_Y_AXIS) +/* Input IDs */ +#define HDAPS_INPUT_VENDOR PCI_VENDOR_ID_IBM +#define HDAPS_INPUT_PRODUCT 0x5054 /* "TP", shared with thinkpad_acpi */ +#define HDAPS_INPUT_JS_VERSION 0x6801 /* Joystick emulation input device */ +#define HDAPS_INPUT_RAW_VERSION 0x4801 /* Raw accelerometer input device */ + +/* Axis orientation. */ +/* The unnatural bit-representation of inversions is for backward + * compatibility with the "invert=1" module parameter. */ +#define HDAPS_ORIENT_INVERT_XY 0x01 /* Invert both X and Y axes. */ +#define HDAPS_ORIENT_INVERT_X 0x02 /* Invert the X axis (uninvert if + * already inverted by INVERT_XY). */ +#define HDAPS_ORIENT_SWAP 0x04 /* Swap the axes. The swap occurs + * before inverting X or Y. */ +#define HDAPS_ORIENT_MAX 0x07 +#define HDAPS_ORIENT_UNDEFINED 0xFF /* Placeholder during initialization */ +#define HDAPS_ORIENT_INVERT_Y (HDAPS_ORIENT_INVERT_XY | HDAPS_ORIENT_INVERT_X) +static struct timer_list hdaps_timer; static struct platform_device *pdev; -static struct input_polled_dev *hdaps_idev; -static unsigned int hdaps_invert; -static u8 km_activity; -static int rest_x; -static int rest_y; - -static DEFINE_MUTEX(hdaps_mtx); - -/* - * __get_latch - Get the value from a given port. Callers must hold hdaps_mtx. - */ -static inline u8 __get_latch(u16 port) -{ - return inb(port) & 0xff; +static struct input_dev *hdaps_idev; /* joystick-like device with fuzz */ +static struct input_dev *hdaps_idev_raw; /* raw hdaps sensor readouts */ +static unsigned int hdaps_invert = HDAPS_ORIENT_UNDEFINED; +static int needs_calibration; + +/* Configuration: */ +static int sampling_rate = 50; /* Sampling rate */ +static int oversampling_ratio = 5; /* Ratio between our sampling rate and + * EC accelerometer sampling rate */ +static int running_avg_filter_order = 2; /* EC running average filter order */ + +/* Latest state readout: */ +static int pos_x, pos_y; /* position */ +static int temperature; /* temperature */ +static int stale_readout = 1; /* last read invalid */ +static int rest_x, rest_y; /* calibrated rest position */ + +/* Last time we saw keyboard and mouse activity: */ +static u64 last_keyboard_jiffies = INITIAL_JIFFIES; +static u64 last_mouse_jiffies = INITIAL_JIFFIES; +static u64 last_update_jiffies = INITIAL_JIFFIES; + +/* input device use count */ +static int hdaps_users; +static DEFINE_MUTEX(hdaps_users_mtx); + +/* Some models require an axis transformation to the standard representation */ +static void transform_axes(int *x, int *y) +{ + if (hdaps_invert & HDAPS_ORIENT_SWAP) { + int z; + z = *x; + *x = *y; + *y = z; + } + if (hdaps_invert & HDAPS_ORIENT_INVERT_XY) { + *x = -*x; + *y = -*y; + } + if (hdaps_invert & HDAPS_ORIENT_INVERT_X) + *x = -*x; } -/* - * __check_latch - Check a port latch for a given value. Returns zero if the - * port contains the given value. Callers must hold hdaps_mtx. +/** + * __hdaps_update - query current state, with locks already acquired + * @fast: if nonzero, do one quick attempt without retries. + * + * Query current accelerometer state and update global state variables.
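+ * The EC returns a whole 16-byte row per transaction; the position + * words, temperature and keyboard/mouse activity flags are parsed from + * fixed offsets in it (see the EC_ACCEL_IDX_* indices above).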
+ * Also prefetches the next query. Caller must hold controller lock. */ -static inline int __check_latch(u16 port, u8 val) +static int __hdaps_update(int fast) { - if (__get_latch(port) == val) - return 0; - return -EINVAL; -} + /* Read data: */ + struct thinkpad_ec_row data; + int ret; -/* - * __wait_latch - Wait up to 100us for a port latch to get a certain value, - * returning zero if the value is obtained. Callers must hold hdaps_mtx. - */ -static int __wait_latch(u16 port, u8 val) -{ - unsigned int i; + data.mask = (1 << EC_ACCEL_IDX_READOUTS) | (1 << EC_ACCEL_IDX_KMACT) | + (3 << EC_ACCEL_IDX_YPOS1) | (3 << EC_ACCEL_IDX_XPOS1) | + (1 << EC_ACCEL_IDX_TEMP1) | (1 << EC_ACCEL_IDX_RETVAL); + if (fast) + ret = thinkpad_ec_try_read_row(&ec_accel_args, &data); + else + ret = thinkpad_ec_read_row(&ec_accel_args, &data); + thinkpad_ec_prefetch_row(&ec_accel_args); /* Prefetch even if error */ + if (ret) + return ret; - for (i = 0; i < 20; i++) { - if (!__check_latch(port, val)) - return 0; - udelay(5); + /* Check status: */ + if (data.val[EC_ACCEL_IDX_RETVAL] != 0x00) { + pr_warn("read RETVAL=0x%02x\n", + data.val[EC_ACCEL_IDX_RETVAL]); + return -EIO; } - return -EIO; + if (data.val[EC_ACCEL_IDX_READOUTS] < 1) + return -EBUSY; /* no pending readout, try again later */ + + /* Parse position data: */ + pos_x = *(s16 *)(data.val+EC_ACCEL_IDX_XPOS1); + pos_y = *(s16 *)(data.val+EC_ACCEL_IDX_YPOS1); + transform_axes(&pos_x, &pos_y); + + /* Keyboard and mouse activity status is cleared as soon as it's read, + * so applications will eat each other's events. Thus we remember any + * event for KMACT_REMEMBER_PERIOD jiffies. + */ + if (data.val[EC_ACCEL_IDX_KMACT] & KEYBD_MASK) + last_keyboard_jiffies = get_jiffies_64(); + if (data.val[EC_ACCEL_IDX_KMACT] & MOUSE_MASK) + last_mouse_jiffies = get_jiffies_64(); + + temperature = data.val[EC_ACCEL_IDX_TEMP1]; + + last_update_jiffies = get_jiffies_64(); + stale_readout = 0; + if (needs_calibration) { + rest_x = pos_x; + rest_y = pos_y; + needs_calibration = 0; + } + + return 0; } -/* - * __device_refresh - request a refresh from the accelerometer. Does not wait - * for refresh to complete. Callers must hold hdaps_mtx. +/** + * hdaps_update - acquire locks and query current state + * + * Query current accelerometer state and update global state variables. + * Also prefetches the next query. + * Retries until timeout if the accelerometer is not in ready status (common). + * Does its own locking. */ -static void __device_refresh(void) +static int hdaps_update(void) { - udelay(200); - if (inb(0x1604) != STATE_FRESH) { - outb(0x11, 0x1610); - outb(0x01, 0x161f); + u64 age = get_jiffies_64() - last_update_jiffies; + int total, ret; + + if (!stale_readout && age < (9*HZ)/(10*sampling_rate)) + return 0; /* already updated recently */ + for (total = 0; total < READ_TIMEOUT_MSECS; total += RETRY_MSECS) { + ret = thinkpad_ec_lock(); + if (ret) + return ret; + ret = __hdaps_update(0); + thinkpad_ec_unlock(); + + if (!ret) + return 0; + if (ret != -EBUSY) + break; + msleep(RETRY_MSECS); } + return ret; } -/* - * __device_refresh_sync - request a synchronous refresh from the - * accelerometer. We wait for the refresh to complete. Returns zero if - * successful and nonzero on error. Callers must hold hdaps_mtx. +/** + * hdaps_set_power - enable or disable power to the accelerometer. + * Returns zero on success and negative error code on failure. Can sleep. 
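+ * (EC function 0x14 takes a single on/off argument byte; the 0x8000 + * read mask requests only the status byte, which must come back 0x00.)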
*/ -static int __device_refresh_sync(void) +static int hdaps_set_power(int on) { - __device_refresh(); - return __wait_latch(0x1604, STATE_FRESH); + struct thinkpad_ec_row args = + { .mask = 0x0003, .val = {0x14, on?0x01:0x00} }; + struct thinkpad_ec_row data = { .mask = 0x8000 }; + int ret = thinkpad_ec_read_row(&args, &data); + if (ret) + return ret; + if (data.val[0xF] != 0x00) + return -EIO; + return 0; } -/* - * __device_complete - indicate to the accelerometer that we are done reading - * data, and then initiate an async refresh. Callers must hold hdaps_mtx. +/** + * hdaps_set_ec_config - set accelerometer parameters. + * @ec_rate: embedded controller sampling rate + * @order: embedded controller running average filter order + * (Normally we have @ec_rate = sampling_rate * oversampling_ratio.) + * Returns zero on success and negative error code on failure. Can sleep. */ -static inline void __device_complete(void) +static int hdaps_set_ec_config(int ec_rate, int order) { - inb(0x161f); - inb(0x1604); - __device_refresh(); + struct thinkpad_ec_row args = { .mask = 0x000F, + .val = {0x10, (u8)ec_rate, (u8)(ec_rate>>8), order} }; + struct thinkpad_ec_row data = { .mask = 0x8000 }; + int ret = thinkpad_ec_read_row(&args, &data); + pr_debug("setting ec_rate=%d, filter_order=%d\n", ec_rate, order); + if (ret) + return ret; + if (data.val[0xF] == 0x03) { + pr_warn("config param out of range\n"); + return -EINVAL; + } + if (data.val[0xF] == 0x06) { + pr_warn("config change already pending\n"); + return -EBUSY; + } + if (data.val[0xF] != 0x00) { + pr_warn("config change error, ret=%d\n", + data.val[0xF]); + return -EIO; + } + return 0; } -/* - * hdaps_readb_one - reads a byte from a single I/O port, placing the value in - * the given pointer. Returns zero on success or a negative error on failure. - * Can sleep. +/** + * hdaps_get_ec_config - get accelerometer parameters. + * @ec_rate: embedded controller sampling rate + * @order: embedded controller running average filter order + * Returns zero on success and negative error code on failure. Can sleep. */ -static int hdaps_readb_one(unsigned int port, u8 *val) +static int hdaps_get_ec_config(int *ec_rate, int *order) { - int ret; - - mutex_lock(&hdaps_mtx); - - /* do a sync refresh -- we need to be sure that we read fresh data */ - ret = __device_refresh_sync(); + const struct thinkpad_ec_row args = + { .mask = 0x0003, .val = {0x17, 0x82} }; + struct thinkpad_ec_row data = { .mask = 0x801F }; + int ret = thinkpad_ec_read_row(&args, &data); if (ret) - goto out; - - *val = inb(port); - __device_complete(); - -out: - mutex_unlock(&hdaps_mtx); - return ret; + return ret; + if (data.val[0xF] != 0x00) + return -EIO; + if (!(data.val[0x1] & 0x01)) + return -ENXIO; /* accelerometer polling not enabled */ + if (data.val[0x1] & 0x02) + return -EBUSY; /* config change in progress, retry later */ + *ec_rate = data.val[0x2] | ((int)(data.val[0x3]) << 8); + *order = data.val[0x4]; + return 0; } -/* __hdaps_read_pair - internal lockless helper for hdaps_read_pair(). */ -static int __hdaps_read_pair(unsigned int port1, unsigned int port2, - int *x, int *y) +/** + * hdaps_get_ec_mode - get EC accelerometer mode + * Returns zero on success and negative error code on failure. Can sleep. 
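+ * (A mode byte of 0x00 means the accelerometer is not available; see + * its use in hdaps_device_init() below.)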
+ */ +static int hdaps_get_ec_mode(u8 *mode) { - /* do a sync refresh -- we need to be sure that we read fresh data */ - if (__device_refresh_sync()) + const struct thinkpad_ec_row args = + { .mask = 0x0001, .val = {0x13} }; + struct thinkpad_ec_row data = { .mask = 0x8002 }; + int ret = thinkpad_ec_read_row(&args, &data); + if (ret) + return ret; + if (data.val[0xF] != 0x00) { + pr_warn("accelerometer not implemented (0x%02x)\n", + data.val[0xF]); return -EIO; - - *y = inw(port2); - *x = inw(port1); - km_activity = inb(HDAPS_PORT_KMACT); - __device_complete(); - - /* hdaps_invert is a bitvector to negate the axes */ - if (hdaps_invert & HDAPS_X_AXIS) - *x = -*x; - if (hdaps_invert & HDAPS_Y_AXIS) - *y = -*y; - + } + *mode = data.val[0x1]; return 0; } -/* - * hdaps_read_pair - reads the values from a pair of ports, placing the values - * in the given pointers. Returns zero on success. Can sleep. +/** + * hdaps_check_ec - checks something about the EC. + * Follows the clean-room spec for HDAPS; we don't know what it means. + * Returns zero on success and negative error code on failure. Can sleep. */ -static int hdaps_read_pair(unsigned int port1, unsigned int port2, - int *val1, int *val2) +static int hdaps_check_ec(void) { - int ret; - - mutex_lock(&hdaps_mtx); - ret = __hdaps_read_pair(port1, port2, val1, val2); - mutex_unlock(&hdaps_mtx); - - return ret; + const struct thinkpad_ec_row args = + { .mask = 0x0003, .val = {0x17, 0x81} }; + struct thinkpad_ec_row data = { .mask = 0x800E }; + int ret = thinkpad_ec_read_row(&args, &data); + if (ret) + return ret; + if (!((data.val[0x1] == 0x00 && data.val[0x2] == 0x60) || /* cleanroom spec */ + (data.val[0x1] == 0x01 && data.val[0x2] == 0x00)) || /* seen on T61 */ + data.val[0x3] != 0x00 || data.val[0xF] != 0x00) { + pr_warn("hdaps_check_ec: bad response (0x%x,0x%x,0x%x,0x%x)\n", + data.val[0x1], data.val[0x2], + data.val[0x3], data.val[0xF]); + return -EIO; + } + return 0; } -/* - * hdaps_device_init - initialize the accelerometer. Returns zero on success - * and negative error code on failure. Can sleep. +/** + * hdaps_device_init - initialize the accelerometer. + * + * Call several embedded controller functions to test and initialize the + * accelerometer. + * Returns zero on success and negative error code on failure. Can sleep. */ +#define FAILED_INIT(msg) pr_err("init failed at: %s\n", msg) static int hdaps_device_init(void) { - int total, ret = -ENXIO; + int ret; + u8 mode; - mutex_lock(&hdaps_mtx); + ret = thinkpad_ec_lock(); + if (ret) + return ret; - outb(0x13, 0x1610); - outb(0x01, 0x161f); - if (__wait_latch(0x161f, 0x00)) - goto out; + if (hdaps_get_ec_mode(&mode)) + { FAILED_INIT("hdaps_get_ec_mode failed"); goto bad; } - /* - * Most ThinkPads return 0x01. - * - * Others--namely the R50p, T41p, and T42p--return 0x03. These laptops - * have "inverted" axises. - * - * The 0x02 value occurs when the chip has been previously initialized. 
- */ - if (__check_latch(0x1611, 0x03) && - __check_latch(0x1611, 0x02) && - __check_latch(0x1611, 0x01)) - goto out; + pr_debug("initial mode latch is 0x%02x\n", mode); + if (mode == 0x00) + { FAILED_INIT("accelerometer not available"); goto bad; } - printk(KERN_DEBUG "hdaps: initial latch check good (0x%02x)\n", - __get_latch(0x1611)); + if (hdaps_check_ec()) + { FAILED_INIT("hdaps_check_ec failed"); goto bad; } - outb(0x17, 0x1610); - outb(0x81, 0x1611); - outb(0x01, 0x161f); - if (__wait_latch(0x161f, 0x00)) - goto out; - if (__wait_latch(0x1611, 0x00)) - goto out; - if (__wait_latch(0x1612, 0x60)) - goto out; - if (__wait_latch(0x1613, 0x00)) - goto out; - outb(0x14, 0x1610); - outb(0x01, 0x1611); - outb(0x01, 0x161f); - if (__wait_latch(0x161f, 0x00)) - goto out; - outb(0x10, 0x1610); - outb(0xc8, 0x1611); - outb(0x00, 0x1612); - outb(0x02, 0x1613); - outb(0x01, 0x161f); - if (__wait_latch(0x161f, 0x00)) - goto out; - if (__device_refresh_sync()) - goto out; - if (__wait_latch(0x1611, 0x00)) - goto out; + if (hdaps_set_power(1)) + { FAILED_INIT("hdaps_set_power failed"); goto bad; } - /* we have done our dance, now let's wait for the applause */ - for (total = INIT_TIMEOUT_MSECS; total > 0; total -= INIT_WAIT_MSECS) { - int x, y; - - /* a read of the device helps push it into action */ - __hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y); - if (!__wait_latch(0x1611, 0x02)) { - ret = 0; - break; - } + if (hdaps_set_ec_config(sampling_rate*oversampling_ratio, + running_avg_filter_order)) + { FAILED_INIT("hdaps_set_ec_config failed"); goto bad; } - msleep(INIT_WAIT_MSECS); - } + thinkpad_ec_invalidate(); + udelay(200); -out: - mutex_unlock(&hdaps_mtx); + /* Just prefetch instead of reading, to avoid ~1sec delay on load */ + ret = thinkpad_ec_prefetch_row(&ec_accel_args); + if (ret) + { FAILED_INIT("initial prefetch failed"); goto bad; } + goto good; +bad: + thinkpad_ec_invalidate(); + ret = -ENXIO; +good: + stale_readout = 1; + thinkpad_ec_unlock(); return ret; } +/** + * hdaps_device_shutdown - power off the accelerometer + * Returns nonzero on failure. Can sleep. + */ +static int hdaps_device_shutdown(void) +{ + int ret; + ret = hdaps_set_power(0); + if (ret) { + pr_warn("cannot power off\n"); + return ret; + } + ret = hdaps_set_ec_config(0, 1); + if (ret) + pr_warn("cannot stop EC sampling\n"); + return ret; +} /* Device model stuff */ @@ -306,13 +424,29 @@ } #ifdef CONFIG_PM_SLEEP +static int hdaps_suspend(struct device *dev) +{ + /* Don't do hdaps polls until resume re-initializes the sensor. */ + del_timer_sync(&hdaps_timer); + hdaps_device_shutdown(); /* ignore errors, effect is negligible */ + return 0; +} + static int hdaps_resume(struct device *dev) { - return hdaps_device_init(); + int ret = hdaps_device_init(); + if (ret) + return ret; + + mutex_lock(&hdaps_users_mtx); + if (hdaps_users) + mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate); + mutex_unlock(&hdaps_users_mtx); + return 0; } #endif -static SIMPLE_DEV_PM_OPS(hdaps_pm, NULL, hdaps_resume); +static SIMPLE_DEV_PM_OPS(hdaps_pm, hdaps_suspend, hdaps_resume); static struct platform_driver hdaps_driver = { .probe = hdaps_probe, @@ -322,30 +456,47 @@ }, }; -/* - * hdaps_calibrate - Set our "resting" values. Callers must hold hdaps_mtx. +/** + * hdaps_calibrate - set our "resting" values. + * Does its own locking. 
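+ * Calibration is deferred: needs_calibration is raised and the next + * successful __hdaps_update() records rest_x/rest_y as the new rest + * position.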
*/ static void hdaps_calibrate(void) { - __hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &rest_x, &rest_y); + needs_calibration = 1; + hdaps_update(); + /* If that fails, the mousedev poll will take care of things later. */ } -static void hdaps_mousedev_poll(struct input_polled_dev *dev) +/* Timer handler for updating the input device. Runs in softirq context, + * so avoid lengthy or blocking operations. + */ +static void hdaps_mousedev_poll(unsigned long unused) { - struct input_dev *input_dev = dev->input; - int x, y; - - mutex_lock(&hdaps_mtx); + int ret; - if (__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y)) - goto out; + stale_readout = 1; - input_report_abs(input_dev, ABS_X, x - rest_x); - input_report_abs(input_dev, ABS_Y, y - rest_y); - input_sync(input_dev); + /* Cannot sleep. Try without blocking; if we fail, try again later. */ + if (thinkpad_ec_try_lock()) + goto keep_active; + + ret = __hdaps_update(1); /* fast update, we're in softirq context */ + thinkpad_ec_unlock(); + /* Any of "successful", "not yet ready" and "not prefetched"? */ + if (ret != 0 && ret != -EBUSY && ret != -ENODATA) { + pr_err("poll failed, disabling updates\n"); + return; + } -out: - mutex_unlock(&hdaps_mtx); +keep_active: + /* Even if we failed now, pos_x,y may have been updated earlier: */ + input_report_abs(hdaps_idev, ABS_X, pos_x - rest_x); + input_report_abs(hdaps_idev, ABS_Y, pos_y - rest_y); + input_sync(hdaps_idev); + input_report_abs(hdaps_idev_raw, ABS_X, pos_x); + input_report_abs(hdaps_idev_raw, ABS_Y, pos_y); + input_sync(hdaps_idev_raw); + mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate); } @@ -354,65 +505,41 @@ static ssize_t hdaps_position_show(struct device *dev, struct device_attribute *attr, char *buf) { - int ret, x, y; - - ret = hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y); - if (ret) - return ret; - - return sprintf(buf, "(%d,%d)\n", x, y); -} - -static ssize_t hdaps_variance_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - int ret, x, y; - - ret = hdaps_read_pair(HDAPS_PORT_XVAR, HDAPS_PORT_YVAR, &x, &y); + int ret = hdaps_update(); if (ret) return ret; - - return sprintf(buf, "(%d,%d)\n", x, y); + return sprintf(buf, "(%d,%d)\n", pos_x, pos_y); } static ssize_t hdaps_temp1_show(struct device *dev, struct device_attribute *attr, char *buf) { - u8 uninitialized_var(temp); - int ret; - - ret = hdaps_readb_one(HDAPS_PORT_TEMP1, &temp); + int ret = hdaps_update(); if (ret) return ret; - - return sprintf(buf, "%u\n", temp); -} - -static ssize_t hdaps_temp2_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u8 uninitialized_var(temp); - int ret; - - ret = hdaps_readb_one(HDAPS_PORT_TEMP2, &temp); - if (ret) - return ret; - - return sprintf(buf, "%u\n", temp); + return sprintf(buf, "%d\n", temperature); } static ssize_t hdaps_keyboard_activity_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", KEYBD_ISSET(km_activity)); + int ret = hdaps_update(); + if (ret) + return ret; + return sprintf(buf, "%u\n", + get_jiffies_64() < last_keyboard_jiffies + KMACT_REMEMBER_PERIOD); } static ssize_t hdaps_mouse_activity_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", MOUSE_ISSET(km_activity)); + int ret = hdaps_update(); + if (ret) + return ret; + return sprintf(buf, "%u\n", + get_jiffies_64() < last_mouse_jiffies + KMACT_REMEMBER_PERIOD); } static ssize_t hdaps_calibrate_show(struct device *dev, @@ -425,10 +552,7 @@ struct
device_attribute *attr, const char *buf, size_t count) { - mutex_lock(&hdaps_mtx); hdaps_calibrate(); - mutex_unlock(&hdaps_mtx); - return count; } @@ -445,7 +569,7 @@ int invert; if (sscanf(buf, "%d", &invert) != 1 || - invert < 0 || invert > HDAPS_BOTH_AXES) + invert < 0 || invert > HDAPS_ORIENT_MAX) return -EINVAL; hdaps_invert = invert; @@ -454,24 +578,128 @@ return count; } +static ssize_t hdaps_sampling_rate_show( + struct device *dev, struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", sampling_rate); +} + +static ssize_t hdaps_sampling_rate_store( + struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int rate, ret; + if (sscanf(buf, "%d", &rate) != 1 || rate > HZ || rate <= 0) { + pr_warn("must have 0ident); - return 1; -} - /* hdaps_dmi_match_invert - found an inverted match. */ static int __init hdaps_dmi_match_invert(const struct dmi_system_id *id) { - hdaps_invert = (unsigned long)id->driver_data; - pr_info("inverting axis (%u) readings\n", hdaps_invert); - return hdaps_dmi_match(id); + unsigned int orient = (kernel_ulong_t) id->driver_data; + hdaps_invert = orient; + pr_info("%s detected, setting orientation %u\n", id->ident, orient); + return 1; /* stop enumeration */ } -#define HDAPS_DMI_MATCH_INVERT(vendor, model, axes) { \ +#define HDAPS_DMI_MATCH_INVERT(vendor, model, orient) { \ .ident = vendor " " model, \ .callback = hdaps_dmi_match_invert, \ - .driver_data = (void *)axes, \ + .driver_data = (void *)(orient), \ .matches = { \ DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ DMI_MATCH(DMI_PRODUCT_VERSION, model) \ } \ } -#define HDAPS_DMI_MATCH_NORMAL(vendor, model) \ - HDAPS_DMI_MATCH_INVERT(vendor, model, 0) - -/* Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match - "ThinkPad T42p", so the order of the entries matters. - If your ThinkPad is not recognized, please update to latest - BIOS. This is especially the case for some R52 ThinkPads. */ -static struct dmi_system_id __initdata hdaps_whitelist[] = { - HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R50"), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R51"), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R52"), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61i", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T41"), - HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T42"), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T43"), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61p", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X40"), - HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_Y_AXIS), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61s", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad Z60m"), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61m", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61p", HDAPS_BOTH_AXES), +/* List of models with abnormal axis configuration. 
+ Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match + "ThinkPad T42p", and enumeration stops after the first match, + so the order of the entries matters. */ +static struct dmi_system_id __initdata hdaps_whitelist[] = { + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X40", HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R400", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R500", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60 Tablet", HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60s", HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400s", HDAPS_ORIENT_INVERT_X), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410s", HDAPS_ORIENT_SWAP), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T500", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T510", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W510", HDAPS_ORIENT_MAX), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201 Tablet", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X220", HDAPS_ORIENT_SWAP), { .ident = NULL } }; static int __init hdaps_init(void) { - struct input_dev *idev; int ret; - if (!dmi_check_system(hdaps_whitelist)) { - pr_warn("supported laptop not found!\n"); - ret = -ENODEV; - goto out; - } - - if (!request_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS, "hdaps")) { - ret = -ENXIO; - goto out; - } - + /* Determine axis orientation */ + if (hdaps_invert == HDAPS_ORIENT_UNDEFINED) /* set by module param? */ + if (dmi_check_system(hdaps_whitelist) < 1) /* in whitelist?
*/ + hdaps_invert = 0; /* default */ + + /* Init timer before platform_driver_register, in case of suspend */ + init_timer(&hdaps_timer); + hdaps_timer.function = hdaps_mousedev_poll; ret = platform_driver_register(&hdaps_driver); if (ret) - goto out_region; + goto out; pdev = platform_device_register_simple("hdaps", -1, NULL, 0); if (IS_ERR(pdev)) { @@ -571,47 +792,79 @@ if (ret) goto out_device; - hdaps_idev = input_allocate_polled_device(); + hdaps_idev = input_allocate_device(); if (!hdaps_idev) { ret = -ENOMEM; goto out_group; } - hdaps_idev->poll = hdaps_mousedev_poll; - hdaps_idev->poll_interval = HDAPS_POLL_INTERVAL; + hdaps_idev_raw = input_allocate_device(); + if (!hdaps_idev_raw) { + ret = -ENOMEM; + goto out_idev_first; + } - /* initial calibrate for the input device */ - hdaps_calibrate(); + /* calibration for the input device (deferred to avoid delay) */ + needs_calibration = 1; - /* initialize the input class */ - idev = hdaps_idev->input; - idev->name = "hdaps"; - idev->phys = "isa1600/input0"; - idev->id.bustype = BUS_ISA; - idev->dev.parent = &pdev->dev; - idev->evbit[0] = BIT_MASK(EV_ABS); - input_set_abs_params(idev, ABS_X, + /* initialize the joystick-like fuzzed input device */ + hdaps_idev->name = "ThinkPad HDAPS joystick emulation"; + hdaps_idev->phys = "hdaps/input0"; + hdaps_idev->id.bustype = BUS_HOST; + hdaps_idev->id.vendor = HDAPS_INPUT_VENDOR; + hdaps_idev->id.product = HDAPS_INPUT_PRODUCT; + hdaps_idev->id.version = HDAPS_INPUT_JS_VERSION; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + hdaps_idev->cdev.dev = &pdev->dev; +#endif + hdaps_idev->evbit[0] = BIT(EV_ABS); + hdaps_idev->open = hdaps_mousedev_open; + hdaps_idev->close = hdaps_mousedev_close; + input_set_abs_params(hdaps_idev, ABS_X, -256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); - input_set_abs_params(idev, ABS_Y, + input_set_abs_params(hdaps_idev, ABS_Y, -256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); - ret = input_register_polled_device(hdaps_idev); + ret = input_register_device(hdaps_idev); if (ret) goto out_idev; - pr_info("driver successfully loaded\n"); + /* initialize the raw data input device */ + hdaps_idev_raw->name = "ThinkPad HDAPS accelerometer data"; + hdaps_idev_raw->phys = "hdaps/input1"; + hdaps_idev_raw->id.bustype = BUS_HOST; + hdaps_idev_raw->id.vendor = HDAPS_INPUT_VENDOR; + hdaps_idev_raw->id.product = HDAPS_INPUT_PRODUCT; + hdaps_idev_raw->id.version = HDAPS_INPUT_RAW_VERSION; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + hdaps_idev_raw->cdev.dev = &pdev->dev; +#endif + hdaps_idev_raw->evbit[0] = BIT(EV_ABS); + hdaps_idev_raw->open = hdaps_mousedev_open; + hdaps_idev_raw->close = hdaps_mousedev_close; + input_set_abs_params(hdaps_idev_raw, ABS_X, -32768, 32767, 0, 0); + input_set_abs_params(hdaps_idev_raw, ABS_Y, -32768, 32767, 0, 0); + + ret = input_register_device(hdaps_idev_raw); + if (ret) + goto out_idev_reg_first; + + pr_info("driver successfully loaded.\n"); return 0; +out_idev_reg_first: + input_unregister_device(hdaps_idev); out_idev: - input_free_polled_device(hdaps_idev); + input_free_device(hdaps_idev_raw); +out_idev_first: + input_free_device(hdaps_idev); out_group: sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group); out_device: platform_device_unregister(pdev); out_driver: platform_driver_unregister(&hdaps_driver); -out_region: - release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS); + hdaps_device_shutdown(); out: pr_warn("driver init failed (ret=%d)!\n", ret); return ret; @@ -619,12 +872,12 @@ static void __exit hdaps_exit(void) { - 
input_unregister_polled_device(hdaps_idev); - input_free_polled_device(hdaps_idev); + input_unregister_device(hdaps_idev_raw); + input_unregister_device(hdaps_idev); + hdaps_device_shutdown(); /* ignore errors, effect is negligible */ sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group); platform_device_unregister(pdev); platform_driver_unregister(&hdaps_driver); - release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS); pr_info("driver unloaded\n"); } @@ -632,9 +885,8 @@ module_init(hdaps_init); module_exit(hdaps_exit); -module_param_named(invert, hdaps_invert, int, 0); -MODULE_PARM_DESC(invert, "invert data along each axis. 1 invert x-axis, " - "2 invert y-axis, 3 invert both axes."); +module_param_named(invert, hdaps_invert, uint, 0); +MODULE_PARM_DESC(invert, "axis orientation code"); MODULE_AUTHOR("Robert Love"); MODULE_DESCRIPTION("IBM Hard Drive Active Protection System (HDAPS) driver"); diff -Nur linux-4.2/drivers/platform/x86/thinkpad_ec.c linux-4.2-pck/drivers/platform/x86/thinkpad_ec.c --- linux-4.2/drivers/platform/x86/thinkpad_ec.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/drivers/platform/x86/thinkpad_ec.c 2015-09-08 00:48:22.225030705 -0300 @@ -0,0 +1,513 @@ +/* + * thinkpad_ec.c - ThinkPad embedded controller LPC3 functions + * + * The embedded controller on ThinkPad laptops has a non-standard interface, + * where LPC channel 3 of the H8S EC chip is hooked up to IO ports + * 0x1600-0x161F and implements (a special case of) the H8S LPC protocol. + * The EC LPC interface provides various system management services (currently + * known: battery information and accelerometer readouts). This driver + * provides access and mutual exclusion for the EC interface. + * + * The LPC protocol and terminology are documented here: + * "H8S/2104B Group Hardware Manual", + * http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf + * + * Copyright (C) 2006-2007 Shem Multinymous + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) + #include +#else + #include +#endif + +#define TP_VERSION "0.41" + +MODULE_AUTHOR("Shem Multinymous"); +MODULE_DESCRIPTION("ThinkPad embedded controller hardware access"); +MODULE_VERSION(TP_VERSION); +MODULE_LICENSE("GPL"); + +/* IO ports used by embedded controller LPC channel 3: */ +#define TPC_BASE_PORT 0x1600 +#define TPC_NUM_PORTS 0x20 +#define TPC_STR3_PORT 0x1604 /* Reads H8S EC register STR3 */ +#define TPC_TWR0_PORT 0x1610 /* Mapped to H8S EC register TWR0MW/SW */ +#define TPC_TWR15_PORT 0x161F /* Mapped to H8S EC register TWR15.
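(A read of this port also marks the end of a read transaction; see thinkpad_ec_read_data() below.)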
*/ + /* (and port TPC_TWR0_PORT+i is mapped to H8S reg TWRi for 0<i<16) */ + +/* H8S STR3 status register bits (those we care about): */ +#define H8S_STR3_IBF3B 0x80 /* Bidirectional data-in buffer full */ +#define H8S_STR3_OBF3B 0x40 /* Bidirectional data-out buffer full */ +#define H8S_STR3_MWMF 0x20 /* Master write mode flag */ +#define H8S_STR3_SWMF 0x10 /* Slave write mode flag */ +#define H8S_STR3_MASK 0xF0 /* All STR3 bits we care about */ + +/* Timeouts and retries: */ +#define TPC_READ_RETRIES 150 +#define TPC_READ_NDELAY 500 +#define TPC_REQUEST_RETRIES 1000 +#define TPC_REQUEST_NDELAY 10 +#define TPC_PREFETCH_TIMEOUT (HZ/10) /* invalidate prefetch after 0.1 sec */ + +/* A few macros for printing error messages: */ +#define MSG_FMT(fmt, args...) "thinkpad_ec: " fmt "\n", ## args +#define REQ_FMT(msg, code) MSG_FMT("%s: (0x%02x:0x%02x)->0x%02x", \ + msg, args->val[0x0], args->val[0xF], code) + +/* State of request prefetching: */ +static u8 prefetch_arg0, prefetch_argF; /* Args of last prefetch */ +static u64 prefetch_jiffies; /* time of prefetch, or: */ +#define TPC_PREFETCH_NONE INITIAL_JIFFIES /* No prefetch */ +#define TPC_PREFETCH_JUNK (INITIAL_JIFFIES+1) /* Ignore prefetch */ + +/* Locking: */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) +static DECLARE_MUTEX(thinkpad_ec_mutex); +#else +static DEFINE_SEMAPHORE(thinkpad_ec_mutex); +#endif + +/* Kludge in case the ACPI DSDT reserves the ports we need. */ +static bool force_io; /* Willing to do IO to ports we couldn't reserve? */ +static int reserved_io; /* Successfully reserved the ports? */ +module_param_named(force_io, force_io, bool, 0600); +MODULE_PARM_DESC(force_io, "Force IO even if region already reserved (0=off, 1=on)"); + +/** + * thinkpad_ec_lock - get lock on the ThinkPad EC + * + * Get exclusive lock for accessing the ThinkPad embedded controller LPC3 + * interface. Returns 0 iff lock acquired. + */ +int thinkpad_ec_lock(void) +{ + int ret; + ret = down_interruptible(&thinkpad_ec_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(thinkpad_ec_lock); + +/** + * thinkpad_ec_try_lock - try getting lock on the ThinkPad EC + * + * Try getting an exclusive lock for accessing the ThinkPad embedded + * controller LPC3. Returns immediately if lock is not available; neither + * blocks nor sleeps. Returns 0 iff lock acquired. + */ +int thinkpad_ec_try_lock(void) +{ + return down_trylock(&thinkpad_ec_mutex); +} +EXPORT_SYMBOL_GPL(thinkpad_ec_try_lock); + +/** + * thinkpad_ec_unlock - release lock on ThinkPad EC + * + * Release a previously acquired exclusive lock on the ThinkPad embedded + * controller LPC3 interface. + */ +void thinkpad_ec_unlock(void) +{ + up(&thinkpad_ec_mutex); +} +EXPORT_SYMBOL_GPL(thinkpad_ec_unlock); + +/** + * thinkpad_ec_request_row - tell embedded controller to prepare a row + * @args Input register arguments + * + * Requests a data row by writing to H8S LPC registers TWR0 through TWR15 (or + * a subset thereof) following the protocol prescribed by the "H8S/2104B Group + * Hardware Manual". Does sanity checks via status register STR3. + */ +static int thinkpad_ec_request_row(const struct thinkpad_ec_row *args) +{ + u8 str3; + int i; + + /* EC protocol requires write to TWR0 (function code): */ + if (!(args->mask & 0x0001)) { + printk(KERN_ERR MSG_FMT("bad args->mask=0x%02x", args->mask)); + return -EINVAL; + } + + /* Check initial STR3 status: */ + str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; + if (str3 & H8S_STR3_OBF3B) { /* data already pending */ + inb(TPC_TWR15_PORT); /* marks end of previous transaction */ + if (prefetch_jiffies == TPC_PREFETCH_NONE) + printk(KERN_WARNING REQ_FMT( + "EC has result from unrequested transaction", + str3)); + return -EBUSY; /* EC will be ready in a few usecs */ + } else if (str3 == H8S_STR3_SWMF) { /* busy with previous request */ + if (prefetch_jiffies == TPC_PREFETCH_NONE) + printk(KERN_WARNING REQ_FMT( + "EC is busy with unrequested transaction", + str3)); + return -EBUSY; /* data will be pending in a few usecs */ + } else if (str3 != 0x00) { /* unexpected status? */ + printk(KERN_WARNING REQ_FMT("unexpected initial STR3", str3)); + return -EIO; + } + + /* Send TWR0MW: */ + outb(args->val[0], TPC_TWR0_PORT); + str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; + if (str3 != H8S_STR3_MWMF) { /* not accepted?
*/ + printk(KERN_WARNING REQ_FMT("arg0 rejected", str3)); + return -EIO; + } + + /* Send TWR1 through TWR14: */ + for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) + if ((args->mask>>i)&1) + outb(args->val[i], TPC_TWR0_PORT+i); + + /* Send TWR15 (default to 0x01). This marks end of command. */ + outb((args->mask & 0x8000) ? args->val[0xF] : 0x01, TPC_TWR15_PORT); + + /* Wait until EC starts writing its reply (~60ns on average). + * Releasing locks before this happens may cause an EC hang + * due to firmware bug! + */ + for (i = 0; i < TPC_REQUEST_RETRIES; i++) { + str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; + if (str3 & H8S_STR3_SWMF) /* EC started replying */ + return 0; + else if (!(str3 & ~(H8S_STR3_IBF3B|H8S_STR3_MWMF))) + /* Normal progress (the EC hasn't seen the request + * yet, or is processing it). Wait it out. */ + ndelay(TPC_REQUEST_NDELAY); + else { /* weird EC status */ + printk(KERN_WARNING + REQ_FMT("bad end STR3", str3)); + return -EIO; + } + } + printk(KERN_WARNING REQ_FMT("EC is mysteriously silent", str3)); + return -EIO; +} + +/** + * thinkpad_ec_read_data - read pre-requested row-data from EC + * @args Input register arguments of pre-requested rows + * @data Output register values + * + * Reads current row data from the controller, assuming it's already + * requested. Follows the H8S spec for register access and status checks. + */ +static int thinkpad_ec_read_data(const struct thinkpad_ec_row *args, + struct thinkpad_ec_row *data) +{ + int i; + u8 str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; + /* Once we make a request, STR3 assumes the sequence of values listed + * in the following 'if' as it reads the request and writes its data. + * It takes about a few dozen nanosecs total, with very high variance. + */ + if (str3 == (H8S_STR3_IBF3B|H8S_STR3_MWMF) || + str3 == 0x00 || /* the 0x00 is indistinguishable from idle EC! */ + str3 == H8S_STR3_SWMF) + return -EBUSY; /* not ready yet */ + /* Finally, the EC signals output buffer full: */ + if (str3 != (H8S_STR3_OBF3B|H8S_STR3_SWMF)) { + printk(KERN_WARNING + REQ_FMT("bad initial STR3", str3)); + return -EIO; + } + + /* Read first byte (signals start of read transactions): */ + data->val[0] = inb(TPC_TWR0_PORT); + /* Optionally read 14 more bytes: */ + for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) + if ((data->mask >> i)&1) + data->val[i] = inb(TPC_TWR0_PORT+i); + /* Read last byte from 0x161F (signals end of read transaction): */ + data->val[0xF] = inb(TPC_TWR15_PORT); + + /* Readout still pending? */ + str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; + if (str3 & H8S_STR3_OBF3B) + printk(KERN_WARNING + REQ_FMT("OBF3B=1 after read", str3)); + /* If port 0x161F returns 0x80 too often, the EC may lock up. Warn: */ + if (data->val[0xF] == 0x80) + printk(KERN_WARNING + REQ_FMT("0x161F reports error", data->val[0xF])); + return 0; +} + +/** + * thinkpad_ec_is_row_fetched - is the given row currently prefetched? + * + * To keep things simple we compare only the first and last args; + * this suffices for all known cases. 
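+ * (Two different requests that agree on both val[0x0] and val[0xF] + * would be treated as the same row here.)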
+ */ +static int thinkpad_ec_is_row_fetched(const struct thinkpad_ec_row *args) +{ + return (prefetch_jiffies != TPC_PREFETCH_NONE) && + (prefetch_jiffies != TPC_PREFETCH_JUNK) && + (prefetch_arg0 == args->val[0]) && + (prefetch_argF == args->val[0xF]) && + (get_jiffies_64() < prefetch_jiffies + TPC_PREFETCH_TIMEOUT); +} + +/** + * thinkpad_ec_read_row - request and read data from ThinkPad EC + * @args Input register arguments + * @data Output register values + * + * Read a data row from the ThinkPad embedded controller LPC3 interface. + * Does fetching and retrying if needed. The row is specified by an + * array of 16 bytes, some of which may be undefined (but the first is + * mandatory). These bytes are given in @args->val[], where @args->val[i] is + * used iff ((@args->mask>>i)&1). The resulting row data is stored in + * @data->val[], but is only guaranteed to be valid for indices corresponding + * to set bits in @data->mask. That is, if @data->mask&(1<<i)==0 then + * @data->val[i] is undefined. + * + * Returns -EBUSY on transient error and -EIO on abnormal condition. + * Caller must hold controller lock. + */ +int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, + struct thinkpad_ec_row *data) +{ + int retries, ret; + + if (thinkpad_ec_is_row_fetched(args)) + goto read_row; /* already requested */ + + /* Request the row */ + for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { + ret = thinkpad_ec_request_row(args); + if (!ret) + goto read_row; + if (ret != -EBUSY) + break; + ndelay(TPC_READ_NDELAY); + } + printk(KERN_ERR REQ_FMT("failed requesting row", ret)); + goto out; + +read_row: + /* Read the row's data */ + for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { + ret = thinkpad_ec_read_data(args, data); + if (!ret) + goto out; + if (ret != -EBUSY) + break; + ndelay(TPC_READ_NDELAY); + } + + printk(KERN_ERR REQ_FMT("failed waiting for data", ret)); + +out: + prefetch_jiffies = TPC_PREFETCH_JUNK; + return ret; +} +EXPORT_SYMBOL_GPL(thinkpad_ec_read_row); + +/** + * thinkpad_ec_try_read_row - try reading prefetched data from ThinkPad EC + * @args Input register arguments + * @data Output register values + * + * Try reading a data row from the ThinkPad embedded controller LPC3 + * interface, if this row was recently prefetched using + * thinkpad_ec_prefetch_row(). Does not fetch, retry or block. + * The parameters have the same meaning as in thinkpad_ec_read_row(). + * + * Returns -EBUSY if data not ready and -ENODATA if row not prefetched. + * Caller must hold controller lock. + */ +int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, + struct thinkpad_ec_row *data) +{ + int ret; + if (!thinkpad_ec_is_row_fetched(args)) { + ret = -ENODATA; + } else { + ret = thinkpad_ec_read_data(args, data); + if (!ret) + prefetch_jiffies = TPC_PREFETCH_NONE; /* eaten up */ + } + return ret; +} +EXPORT_SYMBOL_GPL(thinkpad_ec_try_read_row); + +/** + * thinkpad_ec_prefetch_row - prefetch data from ThinkPad EC + * @args Input register arguments + * + * Prefetch a data row from the ThinkPad embedded controller LPC3 + * interface. A subsequent call to thinkpad_ec_read_row() with the + * same arguments will be faster, and a subsequent call to + * thinkpad_ec_try_read_row() stands a good chance of succeeding if + * done neither too soon nor too late. See + * thinkpad_ec_read_row() for the meaning of @args. + * + * Returns -EBUSY on transient error and -EIO on abnormal condition. + * Caller must hold controller lock. 
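+ * + * An illustrative usage pattern (not from any specific caller): take the + * lock, call thinkpad_ec_prefetch_row(args), unlock, and on the next poll + * tick harvest the result with thinkpad_ec_try_read_row(args, data), + * falling back to a full thinkpad_ec_read_row(args, data) on -ENODATA.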
+ */ +int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args) +{ + int ret; + ret = thinkpad_ec_request_row(args); + if (ret) { + prefetch_jiffies = TPC_PREFETCH_JUNK; + } else { + prefetch_jiffies = get_jiffies_64(); + prefetch_arg0 = args->val[0x0]; + prefetch_argF = args->val[0xF]; + } + return ret; +} +EXPORT_SYMBOL_GPL(thinkpad_ec_prefetch_row); + +/** + * thinkpad_ec_invalidate - invalidate prefetched ThinkPad EC data + * + * Invalidate the data prefetched via thinkpad_ec_prefetch_row() from the + * ThinkPad embedded controller LPC3 interface. + * Must be called before unlocking by any code that accesses the controller + * ports directly. + */ +void thinkpad_ec_invalidate(void) +{ + prefetch_jiffies = TPC_PREFETCH_JUNK; +} +EXPORT_SYMBOL_GPL(thinkpad_ec_invalidate); + + +/*** Checking for EC hardware ***/ + +/** + * thinkpad_ec_test - verify the EC is present and follows protocol + * + * Ensure the EC LPC3 channel really works on this machine by making + * an EC request and seeing if the EC follows the documented H8S protocol. + * The requested row just reads battery status, so it should be harmless to + * access it (on a correct EC). + * This test writes to IO ports, so execute only after checking DMI. + */ +static int __init thinkpad_ec_test(void) +{ + int ret; + const struct thinkpad_ec_row args = /* battery 0 basic status */ + { .mask = 0x8001, .val = {0x01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x00} }; + struct thinkpad_ec_row data = { .mask = 0x0000 }; + ret = thinkpad_ec_lock(); + if (ret) + return ret; + ret = thinkpad_ec_read_row(&args, &data); + thinkpad_ec_unlock(); + return ret; +} + +/* Search all DMI device names of a given type for a substring */ +static int __init dmi_find_substring(int type, const char *substr) +{ + const struct dmi_device *dev = NULL; + while ((dev = dmi_find_device(type, NULL, dev))) { + if (strstr(dev->name, substr)) + return 1; + } + return 0; +} + +#define TP_DMI_MATCH(vendor,model) { \ + .ident = vendor " " model, \ + .matches = { \ + DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ + DMI_MATCH(DMI_PRODUCT_VERSION, model) \ + } \ +} + +/* Check DMI for existence of ThinkPad embedded controller */ +static int __init check_dmi_for_ec(void) +{ + /* A few old models that have a good EC but don't report it in DMI */ + struct dmi_system_id tp_whitelist[] = { + TP_DMI_MATCH("IBM", "ThinkPad A30"), + TP_DMI_MATCH("IBM", "ThinkPad T23"), + TP_DMI_MATCH("IBM", "ThinkPad X24"), + TP_DMI_MATCH("LENOVO", "ThinkPad"), + { .ident = NULL } + }; + return dmi_find_substring(DMI_DEV_TYPE_OEM_STRING, + "IBM ThinkPad Embedded Controller") || + dmi_check_system(tp_whitelist); +} + +/*** Init and cleanup ***/ + +static int __init thinkpad_ec_init(void) +{ + if (!check_dmi_for_ec()) { + printk(KERN_WARNING + "thinkpad_ec: no ThinkPad embedded controller!\n"); + return -ENODEV; + } + + if (request_region(TPC_BASE_PORT, TPC_NUM_PORTS, "thinkpad_ec")) { + reserved_io = 1; + } else { + printk(KERN_ERR "thinkpad_ec: cannot claim IO ports %#x-%#x... 
", + TPC_BASE_PORT, + TPC_BASE_PORT + TPC_NUM_PORTS - 1); + if (force_io) { + printk("forcing use of unreserved IO ports.\n"); + } else { + printk("consider using force_io=1.\n"); + return -ENXIO; + } + } + prefetch_jiffies = TPC_PREFETCH_JUNK; + if (thinkpad_ec_test()) { + printk(KERN_ERR "thinkpad_ec: initial ec test failed\n"); + if (reserved_io) + release_region(TPC_BASE_PORT, TPC_NUM_PORTS); + return -ENXIO; + } + printk(KERN_INFO "thinkpad_ec: thinkpad_ec " TP_VERSION " loaded.\n"); + return 0; +} + +static void __exit thinkpad_ec_exit(void) +{ + if (reserved_io) + release_region(TPC_BASE_PORT, TPC_NUM_PORTS); + printk(KERN_INFO "thinkpad_ec: unloaded.\n"); +} + +module_init(thinkpad_ec_init); +module_exit(thinkpad_ec_exit); diff -Nur linux-4.2/drivers/platform/x86/tp_smapi.c linux-4.2-pck/drivers/platform/x86/tp_smapi.c --- linux-4.2/drivers/platform/x86/tp_smapi.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/drivers/platform/x86/tp_smapi.c 2015-09-08 00:48:22.225030705 -0300 @@ -0,0 +1,1493 @@ +/* + * tp_smapi.c - ThinkPad SMAPI support + * + * This driver exposes some features of the System Management Application + * Program Interface (SMAPI) BIOS found on ThinkPad laptops. It works on + * models in which the SMAPI BIOS runs in SMM and is invoked by writing + * to the APM control port 0xB2. + * It also exposes battery status information, obtained from the ThinkPad + * embedded controller (via the thinkpad_ec module). + * Ancient ThinkPad models use a different interface, supported by the + * "thinkpad" module from "tpctl". + * + * Many of the battery status values obtained from the EC simply mirror + * values provided by the battery's Smart Battery System (SBS) interface, so + * their meaning is defined by the Smart Battery Data Specification (see + * http://sbs-forum.org/specs/sbdat110.pdf). References to this SBS spec + * are given in the code where relevant. + * + * Copyright (C) 2006 Shem Multinymous . + * SMAPI access code based on the mwave driver by Mike Sullivan. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include /* CMOS defines */ +#include +#include +#include +#include +#include +#include + +#define TP_VERSION "0.41" +#define TP_DESC "ThinkPad SMAPI Support" +#define TP_DIR "smapi" + +MODULE_AUTHOR("Shem Multinymous"); +MODULE_DESCRIPTION(TP_DESC); +MODULE_VERSION(TP_VERSION); +MODULE_LICENSE("GPL"); + +static struct platform_device *pdev; + +static int tp_debug; +module_param_named(debug, tp_debug, int, 0600); +MODULE_PARM_DESC(debug, "Debug level (0=off, 1=on)"); + +/* A few macros for printk()ing: */ +#define TPRINTK(level, fmt, args...) \ + dev_printk(level, &(pdev->dev), "%s: " fmt "\n", __func__, ## args) +#define DPRINTK(fmt, args...) 
\ + do { if (tp_debug) TPRINTK(KERN_DEBUG, fmt, ## args); } while (0) + +/********************************************************************* + * SMAPI interface + */ + +/* SMAPI functions (register BX when making the SMM call). */ +#define SMAPI_GET_INHIBIT_CHARGE 0x2114 +#define SMAPI_SET_INHIBIT_CHARGE 0x2115 +#define SMAPI_GET_THRESH_START 0x2116 +#define SMAPI_SET_THRESH_START 0x2117 +#define SMAPI_GET_FORCE_DISCHARGE 0x2118 +#define SMAPI_SET_FORCE_DISCHARGE 0x2119 +#define SMAPI_GET_THRESH_STOP 0x211a +#define SMAPI_SET_THRESH_STOP 0x211b + +/* SMAPI error codes (see ThinkPad 770 Technical Reference Manual p.83 at + http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD) */ +#define SMAPI_RETCODE_EOF 0xff +static struct { u8 rc; char *msg; int ret; } smapi_retcode[] = +{ + {0x00, "OK", 0}, + {0x53, "SMAPI function is not available", -ENXIO}, + {0x81, "Invalid parameter", -EINVAL}, + {0x86, "Function is not supported by SMAPI BIOS", -EOPNOTSUPP}, + {0x90, "System error", -EIO}, + {0x91, "System is invalid", -EIO}, + {0x92, "System is busy", -EBUSY}, + {0xa0, "Device error (disk read error)", -EIO}, + {0xa1, "Device is busy", -EBUSY}, + {0xa2, "Device is not attached", -ENXIO}, + {0xa3, "Device is disabled", -EIO}, + {0xa4, "Request parameter is out of range", -EINVAL}, + {0xa5, "Request parameter is not accepted", -EINVAL}, + {0xa6, "Transient error", -EBUSY}, /* ? */ + {SMAPI_RETCODE_EOF, "Unknown error code", -EIO} +}; + + +#define SMAPI_MAX_RETRIES 10 +#define SMAPI_PORT2 0x4F /* fixed port, meaning unclear */ +static unsigned short smapi_port; /* APM control port, normally 0xB2 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) +static DECLARE_MUTEX(smapi_mutex); +#else +static DEFINE_SEMAPHORE(smapi_mutex); +#endif + +/** + * find_smapi_port - read SMAPI port from NVRAM + */ +static int __init find_smapi_port(void) +{ + u16 smapi_id = 0; + unsigned short port = 0; + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + smapi_id = CMOS_READ(0x7C); + smapi_id |= (CMOS_READ(0x7D) << 8); + spin_unlock_irqrestore(&rtc_lock, flags); + + if (smapi_id != 0x5349) { + printk(KERN_ERR "SMAPI not supported (ID=0x%x)\n", smapi_id); + return -ENXIO; + } + spin_lock_irqsave(&rtc_lock, flags); + port = CMOS_READ(0x7E); + port |= (CMOS_READ(0x7F) << 8); + spin_unlock_irqrestore(&rtc_lock, flags); + if (port == 0) { + printk(KERN_ERR "unable to read SMAPI port number\n"); + return -ENXIO; + } + return port; +} + +/** + * smapi_request - make a SMAPI call + * @inEBX, @inECX, @inEDI, @inESI: input registers + * @outEBX, @outECX, @outEDX, @outEDI, @outESI: output registers + * @msg: textual error message + * Invokes the SMAPI SMBIOS with the given input and output args. + * All outputs are optional (can be %NULL). + * Returns 0 when successful, and a negative errno constant + * (see smapi_retcode above) upon failure. + */ +static int smapi_request(u32 inEBX, u32 inECX, + u32 inEDI, u32 inESI, + u32 *outEBX, u32 *outECX, u32 *outEDX, + u32 *outEDI, u32 *outESI, const char **msg) +{ + int ret = 0; + int i; + int retries; + u8 rc; + /* Must use local vars for output regs, due to reg pressure. */ + u32 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI; + + for (retries = 0; retries < SMAPI_MAX_RETRIES; ++retries) { + DPRINTK("req_in: BX=%x CX=%x DI=%x SI=%x", + inEBX, inECX, inEDI, inESI); + + /* SMAPI's SMBIOS call and thinkpad_ec end up using + * different interfaces to the same chip, so play it safe. 
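+ * Holding the EC lock across the SMI below, and invalidating the EC's + * prefetch state before unlocking, keeps both drivers' views of the chip + * consistent.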
*/ + ret = thinkpad_ec_lock(); + if (ret) + return ret; + + __asm__ __volatile__( + "movl $0x00005380,%%eax\n\t" + "movl %6,%%ebx\n\t" + "movl %7,%%ecx\n\t" + "movl %8,%%edi\n\t" + "movl %9,%%esi\n\t" + "xorl %%edx,%%edx\n\t" + "movw %10,%%dx\n\t" + "out %%al,%%dx\n\t" /* trigger SMI to SMBIOS */ + "out %%al,$0x4F\n\t" + "movl %%eax,%0\n\t" + "movl %%ebx,%1\n\t" + "movl %%ecx,%2\n\t" + "movl %%edx,%3\n\t" + "movl %%edi,%4\n\t" + "movl %%esi,%5\n\t" + :"=m"(tmpEAX), + "=m"(tmpEBX), + "=m"(tmpECX), + "=m"(tmpEDX), + "=m"(tmpEDI), + "=m"(tmpESI) + :"m"(inEBX), "m"(inECX), "m"(inEDI), "m"(inESI), + "m"((u16)smapi_port) + :"%eax", "%ebx", "%ecx", "%edx", "%edi", + "%esi"); + + thinkpad_ec_invalidate(); + thinkpad_ec_unlock(); + + /* Don't let the next SMAPI access happen too quickly; it + * may cause problems. (We're holding smapi_mutex.) */ + msleep(50); + + if (outEBX) *outEBX = tmpEBX; + if (outECX) *outECX = tmpECX; + if (outEDX) *outEDX = tmpEDX; + if (outESI) *outESI = tmpESI; + if (outEDI) *outEDI = tmpEDI; + + /* Look up error code */ + rc = (tmpEAX>>8)&0xFF; + for (i = 0; smapi_retcode[i].rc != SMAPI_RETCODE_EOF && + smapi_retcode[i].rc != rc; ++i) {} + ret = smapi_retcode[i].ret; + if (msg) + *msg = smapi_retcode[i].msg; + + DPRINTK("req_out: AX=%x BX=%x CX=%x DX=%x DI=%x SI=%x r=%d", + tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI, ret); + if (ret) + TPRINTK(KERN_NOTICE, "SMAPI error: %s (func=%x)", + smapi_retcode[i].msg, inEBX); + + if (ret != -EBUSY) + return ret; + } + return ret; +} + +/* Convenience wrapper: discard output arguments */ +static int smapi_write(u32 inEBX, u32 inECX, + u32 inEDI, u32 inESI, const char **msg) +{ + return smapi_request(inEBX, inECX, inEDI, inESI, + NULL, NULL, NULL, NULL, NULL, msg); +} + + +/********************************************************************* + * Specific SMAPI services + * All of these functions return 0 upon success, and a negative errno + * constant (see smapi_retcode) on failure. + */ + +enum thresh_type { + THRESH_STOP = 0, /* the code assumes this is 0 for brevity */ + THRESH_START +}; +#define THRESH_NAME(which) ((which == THRESH_START) ? "start" : "stop") + +/** + * __get_real_thresh - read battery charge start/stop threshold from SMAPI + * @bat: battery number (0 or 1) + * @which: THRESH_START or THRESH_STOP + * @thresh: 1..99, 0=default (pass this as-is to SMAPI) + * @outEDI: some additional state that needs to be preserved, meaning unknown + * @outESI: some additional state that needs to be preserved, meaning unknown + */ +static int __get_real_thresh(int bat, enum thresh_type which, int *thresh, + u32 *outEDI, u32 *outESI) +{ + u32 ebx = (which == THRESH_START) ? 
SMAPI_GET_THRESH_START + : SMAPI_GET_THRESH_STOP; + u32 ecx = (bat+1)<<8; + const char *msg; + int ret = smapi_request(ebx, ecx, 0, 0, NULL, + &ecx, NULL, outEDI, outESI, &msg); + if (ret) { + TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: %s", + THRESH_NAME(which), bat, msg); + return ret; + } + if (!(ecx&0x00000100)) { + TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: ecx=0x%x", + THRESH_NAME(which), bat, ecx); + return -EIO; + } + if (thresh) + *thresh = ecx&0xFF; + return 0; +} + +/** + * get_real_thresh - read battery charge start/stop threshold from SMAPI + * @bat: battery number (0 or 1) + * @which: THRESH_START or THRESH_STOP + * @thresh: 1..99, 0=default (passes as-is to SMAPI) + */ +static int get_real_thresh(int bat, enum thresh_type which, int *thresh) +{ + return __get_real_thresh(bat, which, thresh, NULL, NULL); +} + +/** + * set_real_thresh - write battery start/stop charge threshold to SMAPI + * @bat: battery number (0 or 1) + * @which: THRESH_START or THRESH_STOP + * @thresh: 1..99, 0=default (passes as-is to SMAPI) + */ +static int set_real_thresh(int bat, enum thresh_type which, int thresh) +{ + u32 ebx = (which == THRESH_START) ? SMAPI_SET_THRESH_START + : SMAPI_SET_THRESH_STOP; + u32 ecx = ((bat+1)<<8) + thresh; + u32 getDI, getSI; + const char *msg; + int ret; + + /* verify read before writing */ + ret = __get_real_thresh(bat, which, NULL, &getDI, &getSI); + if (ret) + return ret; + + ret = smapi_write(ebx, ecx, getDI, getSI, &msg); + if (ret) + TPRINTK(KERN_NOTICE, "set %s to %d for bat=%d failed: %s", + THRESH_NAME(which), thresh, bat, msg); + else + TPRINTK(KERN_INFO, "set %s to %d for bat=%d", + THRESH_NAME(which), thresh, bat); + return ret; +} + +/** + * __get_inhibit_charge_minutes - get inhibit charge period from SMAPI + * @bat: battery number (0 or 1) + * @minutes: period in minutes (1..65535 minutes, 0=disabled) + * @outECX: some additional state that needs to be preserved, meaning unknown + * Note that @minutes is the originally set value, it does not count down. + */ +static int __get_inhibit_charge_minutes(int bat, int *minutes, u32 *outECX) +{ + u32 ecx = (bat+1)<<8; + u32 esi; + const char *msg; + int ret = smapi_request(SMAPI_GET_INHIBIT_CHARGE, ecx, 0, 0, + NULL, &ecx, NULL, NULL, &esi, &msg); + if (ret) { + TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); + return ret; + } + if (!(ecx&0x0100)) { + TPRINTK(KERN_NOTICE, "bad ecx=0x%x for bat=%d", ecx, bat); + return -EIO; + } + if (minutes) + *minutes = (ecx&0x0001)?esi:0; + if (outECX) + *outECX = ecx; + return 0; +} + +/** + * get_inhibit_charge_minutes - get inhibit charge period from SMAPI + * @bat: battery number (0 or 1) + * @minutes: period in minutes (1..65535 minutes, 0=disabled) + * Note that @minutes is the originally set value, it does not count down. + */ +static int get_inhibit_charge_minutes(int bat, int *minutes) +{ + return __get_inhibit_charge_minutes(bat, minutes, NULL); +} + +/** + * set_inhibit_charge_minutes - write inhibit charge period to SMAPI + * @bat: battery number (0 or 1) + * @minutes: period in minutes (1..65535 minutes, 0=disabled) + */ +static int set_inhibit_charge_minutes(int bat, int minutes) +{ + u32 ecx; + const char *msg; + int ret; + + /* verify read before writing */ + ret = __get_inhibit_charge_minutes(bat, NULL, &ecx); + if (ret) + return ret; + + ecx = ((bat+1)<<8) | (ecx&0x00FE) | (minutes > 0 ? 
0x0001 : 0x0000); + if (minutes > 0xFFFF) + minutes = 0xFFFF; + ret = smapi_write(SMAPI_SET_INHIBIT_CHARGE, ecx, 0, minutes, &msg); + if (ret) + TPRINTK(KERN_NOTICE, + "set to %d failed for bat=%d: %s", minutes, bat, msg); + else + TPRINTK(KERN_INFO, "set to %d for bat=%d\n", minutes, bat); + return ret; +} + + +/** + * get_force_discharge - get status of forced discharging from SMAPI + * @bat: battery number (0 or 1) + * @enabled: 1 if forced discharge is enabled, 0 if not + */ +static int get_force_discharge(int bat, int *enabled) +{ + u32 ecx = (bat+1)<<8; + const char *msg; + int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, + NULL, &ecx, NULL, NULL, NULL, &msg); + if (ret) { + TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); + return ret; + } + *enabled = (!(ecx&0x00000100) && (ecx&0x00000001))?1:0; + return 0; +} + +/** + * set_force_discharge - write status of forced discharging to SMAPI + * @bat: battery number (0 or 1) + * @enabled: 1 if forced discharge is enabled, 0 if not + */ +static int set_force_discharge(int bat, int enabled) +{ + u32 ecx = (bat+1)<<8; + const char *msg; + int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, + NULL, &ecx, NULL, NULL, NULL, &msg); + if (ret) { + TPRINTK(KERN_NOTICE, "get failed for bat=%d: %s", bat, msg); + return ret; + } + if (ecx&0x00000100) { + TPRINTK(KERN_NOTICE, "cannot force discharge bat=%d", bat); + return -EIO; + } + + ecx = ((bat+1)<<8) | (ecx&0x000000FA) | (enabled?0x00000001:0); + ret = smapi_write(SMAPI_SET_FORCE_DISCHARGE, ecx, 0, 0, &msg); + if (ret) + TPRINTK(KERN_NOTICE, "set to %d failed for bat=%d: %s", + enabled, bat, msg); + else + TPRINTK(KERN_INFO, "set to %d for bat=%d", enabled, bat); + return ret; +} + + +/********************************************************************* + * Wrappers to threshold-related SMAPI functions, which handle default + * thresholds and related quirks. + */ + +/* Minimum, default and minimum difference for battery charging thresholds: */ +#define MIN_THRESH_DELTA 4 /* Min delta between start and stop thresh */ +#define MIN_THRESH_START 2 +#define MAX_THRESH_START (100-MIN_THRESH_DELTA) +#define MIN_THRESH_STOP (MIN_THRESH_START + MIN_THRESH_DELTA) +#define MAX_THRESH_STOP 100 +#define DEFAULT_THRESH_START MAX_THRESH_START +#define DEFAULT_THRESH_STOP MAX_THRESH_STOP + +/* The GUI of IBM's Battery Maximizer seems to show a start threshold that + * is 1 more than the value we set/get via SMAPI. Since the threshold is + * maintained across reboot, this can be confusing. So we kludge our + * interface for interoperability: */ +#define BATMAX_FIX 1 + +/* Get charge start/stop threshold (1..100), + * substituting default values if needed and applying BATMAX_FIX. */ +static int get_thresh(int bat, enum thresh_type which, int *thresh) +{ + int ret = get_real_thresh(bat, which, thresh); + if (ret) + return ret; + if (*thresh == 0) + *thresh = (which == THRESH_START) ? DEFAULT_THRESH_START + : DEFAULT_THRESH_STOP; + else if (which == THRESH_START) + *thresh += BATMAX_FIX; + return 0; +} + + +/* Set charge start/stop threshold (1..100), + * substituting default values if needed and applying BATMAX_FIX. 
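+ * Example: writing 96 here stores a raw 95 via SMAPI, and get_thresh() + * reports that raw 95 back as 96, matching the Battery Maximizer display.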
*/ +static int set_thresh(int bat, enum thresh_type which, int thresh) +{ + if (which == THRESH_STOP && thresh == DEFAULT_THRESH_STOP) + thresh = 0; /* 100 is out of range, but default means 100 */ + if (which == THRESH_START) + thresh -= BATMAX_FIX; + return set_real_thresh(bat, which, thresh); +} + +/********************************************************************* + * ThinkPad embedded controller readout and basic functions + */ + +/** + * read_tp_ec_row - read data row from the ThinkPad embedded controller + * @arg0: EC command code + * @bat: battery number, 0 or 1 + * @j: the byte value to be used for "junk" (unused) input/outputs + * @dataval: result vector + */ +static int read_tp_ec_row(u8 arg0, int bat, u8 j, u8 *dataval) +{ + int ret; + const struct thinkpad_ec_row args = { .mask = 0xFFFF, + .val = {arg0, j,j,j,j,j,j,j,j,j,j,j,j,j,j, (u8)bat} }; + struct thinkpad_ec_row data = { .mask = 0xFFFF }; + + ret = thinkpad_ec_lock(); + if (ret) + return ret; + ret = thinkpad_ec_read_row(&args, &data); + thinkpad_ec_unlock(); + memcpy(dataval, &data.val, TP_CONTROLLER_ROW_LEN); + return ret; +} + +/** + * power_device_present - check for presence of battery or AC power + * @bat: 0 for battery 0, 1 for battery 1, otherwise AC power + * Returns 1 if present, 0 if not present, negative if error. + */ +static int power_device_present(int bat) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + u8 test; + int ret = read_tp_ec_row(1, bat, 0, row); + if (ret) + return ret; + switch (bat) { + case 0: test = 0x40; break; /* battery 0 */ + case 1: test = 0x20; break; /* battery 1 */ + default: test = 0x80; /* AC power */ + } + return (row[0] & test) ? 1 : 0; +} + +/** + * bat_has_status - check if battery can report detailed status + * @bat: 0 for battery 0, 1 for battery 1 + * Returns 1 if yes, 0 if no, negative if error. + */ +static int bat_has_status(int bat) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + int ret = read_tp_ec_row(1, bat, 0, row); + if (ret) + return ret; + if ((row[0] & (bat?0x20:0x40)) == 0) /* no battery */ + return 0; + if ((row[1] & (0x60)) == 0) /* no status */ + return 0; + return 1; +} + +/** + * get_tp_ec_bat_16 - read a 16-bit value from EC battery status data + * @arg0: first argument to EC + * @off: offset in row returned from EC + * @bat: battery (0 or 1) + * @val: the 16-bit value obtained + * Returns nonzero on error. 
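+ * For example, show_battery_voltage() below boils down to + * get_tp_ec_bat_16(1, 6, bat, &val): bytes 6-7 of EC row 1, in mV.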
+ */ +static int get_tp_ec_bat_16(u8 arg0, int offset, int bat, u16 *val) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + int ret; + if (bat_has_status(bat) != 1) + return -ENXIO; + ret = read_tp_ec_row(arg0, bat, 0, row); + if (ret) + return ret; + *val = *(u16 *)(row+offset); + return 0; +} + +/********************************************************************* + * sysfs attributes for batteries - + * definitions and helper functions + */ + +/* A custom device attribute struct which holds a battery number */ +struct bat_device_attribute { + struct device_attribute dev_attr; + int bat; +}; + +/** + * attr_get_bat - get the battery to which the attribute belongs + */ +static int attr_get_bat(struct device_attribute *attr) +{ + return container_of(attr, struct bat_device_attribute, dev_attr)->bat; +} + +/** + * show_tp_ec_bat_u16 - show an unsigned 16-bit battery attribute + * @arg0: specified 1st argument of EC row to read + * @offset: byte offset in EC row data + * @mul: correction factor to multiply by + * @na_msg: string to output if value not available (0xFFFF) + * @attr: battery attribute + * @buf: output buffer + * The 16-bit value is read from the EC, treated as unsigned, + * transformed as x->mul*x, and printed to the buffer. + * If the value is 0xFFFF and na_msg!=%NULL, na_msg is printed instead. + */ +static ssize_t show_tp_ec_bat_u16(u8 arg0, int offset, int mul, + const char *na_msg, + struct device_attribute *attr, char *buf) +{ + u16 val; + int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); + if (ret) + return ret; + if (na_msg && val == 0xFFFF) + return sprintf(buf, "%s\n", na_msg); + else + return sprintf(buf, "%u\n", mul*(unsigned int)val); +} + +/** + * show_tp_ec_bat_s16 - show a signed 16-bit battery attribute + * @arg0: specified 1st argument of EC row to read + * @offset: byte offset in EC row data + * @mul: correction factor to multiply by + * @add: correction term to add after multiplication + * @attr: battery attribute + * @buf: output buffer + * The 16-bit value is read from the EC, treated as signed, + * transformed as x->mul*x+add, and printed to the buffer. + */ +static ssize_t show_tp_ec_bat_s16(u8 arg0, int offset, int mul, int add, + struct device_attribute *attr, char *buf) +{ + u16 val; + int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); + if (ret) + return ret; + return sprintf(buf, "%d\n", mul*(s16)val+add); +} + +/** + * show_tp_ec_bat_str - show a string from EC battery status data + * @arg0: specified 1st argument of EC row to read + * @offset: byte offset in EC row data + * @maxlen: maximum string length + * @attr: battery attribute + * @buf: output buffer + */ +static ssize_t show_tp_ec_bat_str(u8 arg0, int offset, int maxlen, + struct device_attribute *attr, char *buf) +{ + int bat = attr_get_bat(attr); + u8 row[TP_CONTROLLER_ROW_LEN]; + int ret; + if (bat_has_status(bat) != 1) + return -ENXIO; + ret = read_tp_ec_row(arg0, bat, 0, row); + if (ret) + return ret; + strncpy(buf, (char *)row+offset, maxlen); + buf[maxlen] = 0; + strcat(buf, "\n"); + return strlen(buf); +} + +/** + * show_tp_ec_bat_power - show a power readout from EC battery status data + * @arg0: specified 1st argument of EC row to read + * @offV: byte offset of voltage in EC row data + * @offI: byte offset of current in EC row data + * @attr: battery attribute + * @buf: output buffer + * Computes the power as current*voltage from the two given readout offsets. 
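+ * The EC reports millivolts and milliamps, so milliamp*millivolt is in uW + * and the division by 1000 yields mW; e.g. 10800 mV * -1500 mA gives + * -16200 mW while discharging.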
+ */ +static ssize_t show_tp_ec_bat_power(u8 arg0, int offV, int offI, + struct device_attribute *attr, char *buf) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + int milliamp, millivolt, ret; + int bat = attr_get_bat(attr); + if (bat_has_status(bat) != 1) + return -ENXIO; + ret = read_tp_ec_row(1, bat, 0, row); + if (ret) + return ret; + millivolt = *(u16 *)(row+offV); + milliamp = *(s16 *)(row+offI); + return sprintf(buf, "%d\n", milliamp*millivolt/1000); /* units: mW */ +} + +/** + * show_tp_ec_bat_date - decode and show a date from EC battery status data + * @arg0: specified 1st argument of EC raw to read + * @offset: byte offset in EC raw data + * @attr: battery attribute + * @buf: output buffer + */ +static ssize_t show_tp_ec_bat_date(u8 arg0, int offset, + struct device_attribute *attr, char *buf) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + u16 v; + int ret; + int day, month, year; + int bat = attr_get_bat(attr); + if (bat_has_status(bat) != 1) + return -ENXIO; + ret = read_tp_ec_row(arg0, bat, 0, row); + if (ret) + return ret; + + /* Decode bit-packed: v = day | (month<<5) | ((year-1980)<<9) */ + v = *(u16 *)(row+offset); + day = v & 0x1F; + month = (v >> 5) & 0xF; + year = (v >> 9) + 1980; + + return sprintf(buf, "%04d-%02d-%02d\n", year, month, day); +} + + +/********************************************************************* + * sysfs attribute I/O for batteries - + * the actual attribute show/store functions + */ + +static ssize_t show_battery_start_charge_thresh(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int thresh; + int bat = attr_get_bat(attr); + int ret = get_thresh(bat, THRESH_START, &thresh); + if (ret) + return ret; + return sprintf(buf, "%d\n", thresh); /* units: percent */ +} + +static ssize_t show_battery_stop_charge_thresh(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int thresh; + int bat = attr_get_bat(attr); + int ret = get_thresh(bat, THRESH_STOP, &thresh); + if (ret) + return ret; + return sprintf(buf, "%d\n", thresh); /* units: percent */ +} + +/** + * store_battery_start_charge_thresh - store battery_start_charge_thresh attr + * Since this is a kernel<->user interface, we ensure a valid state for + * the hardware. We do this by clamping the requested threshold to the + * valid range and, if necessary, moving the other threshold so that + * it's MIN_THRESH_DELTA away from this one. + */ +static ssize_t store_battery_start_charge_thresh(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int thresh, other_thresh, ret; + int bat = attr_get_bat(attr); + + if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) + return -EINVAL; + + if (thresh < MIN_THRESH_START) /* clamp up to MIN_THRESH_START */ + thresh = MIN_THRESH_START; + if (thresh > MAX_THRESH_START) /* clamp down to MAX_THRESH_START */ + thresh = MAX_THRESH_START; + + down(&smapi_mutex); + ret = get_thresh(bat, THRESH_STOP, &other_thresh); + if (ret != -EOPNOTSUPP && ret != -ENXIO) { + if (ret) /* other threshold is set? */ + goto out; + ret = get_real_thresh(bat, THRESH_START, NULL); + if (ret) /* this threshold is set? 
*/ + goto out; + if (other_thresh < thresh+MIN_THRESH_DELTA) { + /* move other thresh to keep it above this one */ + ret = set_thresh(bat, THRESH_STOP, + thresh+MIN_THRESH_DELTA); + if (ret) + goto out; + } + } + ret = set_thresh(bat, THRESH_START, thresh); +out: + up(&smapi_mutex); + return count; + +} + +/** + * store_battery_stop_charge_thresh - store battery_stop_charge_thresh attr + * Since this is a kernel<->user interface, we ensure a valid state for + * the hardware. We do this by clamping the requested threshold to the + * valid range and, if necessary, moving the other threshold so that + * it's MIN_THRESH_DELTA away from this one. + */ +static ssize_t store_battery_stop_charge_thresh(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int thresh, other_thresh, ret; + int bat = attr_get_bat(attr); + + if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) + return -EINVAL; + + if (thresh < MIN_THRESH_STOP) /* clamp up to MIN_THRESH_STOP */ + thresh = MIN_THRESH_STOP; + + down(&smapi_mutex); + ret = get_thresh(bat, THRESH_START, &other_thresh); + if (ret != -EOPNOTSUPP && ret != -ENXIO) { /* other threshold exists? */ + if (ret) + goto out; + /* this threshold exists? */ + ret = get_real_thresh(bat, THRESH_STOP, NULL); + if (ret) + goto out; + if (other_thresh >= thresh-MIN_THRESH_DELTA) { + /* move other thresh to be below this one */ + ret = set_thresh(bat, THRESH_START, + thresh-MIN_THRESH_DELTA); + if (ret) + goto out; + } + } + ret = set_thresh(bat, THRESH_STOP, thresh); +out: + up(&smapi_mutex); + return count; +} + +static ssize_t show_battery_inhibit_charge_minutes(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int minutes; + int bat = attr_get_bat(attr); + int ret = get_inhibit_charge_minutes(bat, &minutes); + if (ret) + return ret; + return sprintf(buf, "%d\n", minutes); /* units: minutes */ +} + +static ssize_t store_battery_inhibit_charge_minutes(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + int minutes; + int bat = attr_get_bat(attr); + if (sscanf(buf, "%d", &minutes) != 1 || minutes < 0) { + TPRINTK(KERN_ERR, "inhibit_charge_minutes: " + "must be a non-negative integer"); + return -EINVAL; + } + ret = set_inhibit_charge_minutes(bat, minutes); + if (ret) + return ret; + return count; +} + +static ssize_t show_battery_force_discharge(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int enabled; + int bat = attr_get_bat(attr); + int ret = get_force_discharge(bat, &enabled); + if (ret) + return ret; + return sprintf(buf, "%d\n", enabled); /* type: boolean */ +} + +static ssize_t store_battery_force_discharge(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int ret; + int enabled; + int bat = attr_get_bat(attr); + if (sscanf(buf, "%d", &enabled) != 1 || enabled < 0 || enabled > 1) + return -EINVAL; + ret = set_force_discharge(bat, enabled); + if (ret) + return ret; + return count; +} + +static ssize_t show_battery_installed( + struct device *dev, struct device_attribute *attr, char *buf) +{ + int bat = attr_get_bat(attr); + int ret = power_device_present(bat); + if (ret < 0) + return ret; + return sprintf(buf, "%d\n", ret); /* type: boolean */ +} + +static ssize_t show_battery_state( + struct device *dev, struct device_attribute *attr, char *buf) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + const char *txt; + int ret; + int bat = attr_get_bat(attr); + if (bat_has_status(bat) != 1) + return 
sprintf(buf, "none\n"); + ret = read_tp_ec_row(1, bat, 0, row); + if (ret) + return ret; + switch (row[1] & 0xf0) { + case 0xc0: txt = "idle"; break; + case 0xd0: txt = "discharging"; break; + case 0xe0: txt = "charging"; break; + default: return sprintf(buf, "unknown (0x%x)\n", row[1]); + } + return sprintf(buf, "%s\n", txt); /* type: string from fixed set */ +} + +static ssize_t show_battery_manufacturer( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: string. SBS spec v1.1 p34: ManufacturerName() */ + return show_tp_ec_bat_str(4, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); +} + +static ssize_t show_battery_model( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: string. SBS spec v1.1 p34: DeviceName() */ + return show_tp_ec_bat_str(5, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); +} + +static ssize_t show_battery_barcoding( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: string */ + return show_tp_ec_bat_str(7, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); +} + +static ssize_t show_battery_chemistry( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: string. SBS spec v1.1 p34-35: DeviceChemistry() */ + return show_tp_ec_bat_str(6, 2, 5, attr, buf); +} + +static ssize_t show_battery_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV. SBS spec v1.1 p24: Voltage() */ + return show_tp_ec_bat_u16(1, 6, 1, NULL, attr, buf); +} + +static ssize_t show_battery_design_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV. SBS spec v1.1 p32: DesignVoltage() */ + return show_tp_ec_bat_u16(3, 4, 1, NULL, attr, buf); +} + +static ssize_t show_battery_charging_max_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV. SBS spec v1.1 p37,39: ChargingVoltage() */ + return show_tp_ec_bat_u16(9, 8, 1, NULL, attr, buf); +} + +static ssize_t show_battery_group0_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV */ + return show_tp_ec_bat_u16(0xA, 12, 1, NULL, attr, buf); +} + +static ssize_t show_battery_group1_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV */ + return show_tp_ec_bat_u16(0xA, 10, 1, NULL, attr, buf); +} + +static ssize_t show_battery_group2_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV */ + return show_tp_ec_bat_u16(0xA, 8, 1, NULL, attr, buf); +} + +static ssize_t show_battery_group3_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV */ + return show_tp_ec_bat_u16(0xA, 6, 1, NULL, attr, buf); +} + +static ssize_t show_battery_current_now( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mA. SBS spec v1.1 p24: Current() */ + return show_tp_ec_bat_s16(1, 8, 1, 0, attr, buf); +} + +static ssize_t show_battery_current_avg( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mA. SBS spec v1.1 p24: AverageCurrent() */ + return show_tp_ec_bat_s16(1, 10, 1, 0, attr, buf); +} + +static ssize_t show_battery_charging_max_current( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mA. SBS spec v1.1 p36,38: ChargingCurrent() */ + return show_tp_ec_bat_s16(9, 6, 1, 0, attr, buf); +} + +static ssize_t show_battery_power_now( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mW. 
SBS spec v1.1: Voltage()*Current() */ + return show_tp_ec_bat_power(1, 6, 8, attr, buf); +} + +static ssize_t show_battery_power_avg( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mW. SBS spec v1.1: Voltage()*AverageCurrent() */ + return show_tp_ec_bat_power(1, 6, 10, attr, buf); +} + +static ssize_t show_battery_remaining_percent( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: percent. SBS spec v1.1 p25: RelativeStateOfCharge() */ + return show_tp_ec_bat_u16(1, 12, 1, NULL, attr, buf); +} + +static ssize_t show_battery_remaining_percent_error( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: percent. SBS spec v1.1 p25: MaxError() */ + return show_tp_ec_bat_u16(9, 4, 1, NULL, attr, buf); +} + +static ssize_t show_battery_remaining_charging_time( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: minutes. SBS spec v1.1 p27: AverageTimeToFull() */ + return show_tp_ec_bat_u16(2, 8, 1, "not_charging", attr, buf); +} + +static ssize_t show_battery_remaining_running_time( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ + return show_tp_ec_bat_u16(2, 6, 1, "not_discharging", attr, buf); +} + +static ssize_t show_battery_remaining_running_time_now( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ + return show_tp_ec_bat_u16(2, 4, 1, "not_discharging", attr, buf); +} + +static ssize_t show_battery_remaining_capacity( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mWh. SBS spec v1.1 p26. */ + return show_tp_ec_bat_u16(1, 14, 10, "", attr, buf); +} + +static ssize_t show_battery_last_full_capacity( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mWh. SBS spec v1.1 p26: FullChargeCapacity() */ + return show_tp_ec_bat_u16(2, 2, 10, "", attr, buf); +} + +static ssize_t show_battery_design_capacity( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mWh. SBS spec v1.1 p32: DesignCapacity() */ + return show_tp_ec_bat_u16(3, 2, 10, "", attr, buf); +} + +static ssize_t show_battery_cycle_count( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: ordinal. SBS spec v1.1 p32: CycleCount() */ + return show_tp_ec_bat_u16(2, 12, 1, "", attr, buf); +} + +static ssize_t show_battery_temperature( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: millicelsius. SBS spec v1.1: Temperature()*10 */ + return show_tp_ec_bat_s16(1, 4, 100, -273100, attr, buf); +} + +static ssize_t show_battery_serial( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: int. SBS spec v1.1 p34: SerialNumber() */ + return show_tp_ec_bat_u16(3, 10, 1, "", attr, buf); +} + +static ssize_t show_battery_manufacture_date( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: YYYY-MM-DD. SBS spec v1.1 p34: ManufactureDate() */ + return show_tp_ec_bat_date(3, 8, attr, buf); +} + +static ssize_t show_battery_first_use_date( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: YYYY-MM-DD */ + return show_tp_ec_bat_date(8, 2, attr, buf); +} + +/** + * show_battery_dump - show the battery's dump attribute + * The dump attribute gives a hex dump of all EC readouts related to a + * battery. 
Some of the enumerated values don't really exist (i.e., the + * EC function just leaves them untouched); we use a kludge to detect and + * denote these. + */ +#define MIN_DUMP_ARG0 0x00 +#define MAX_DUMP_ARG0 0x0a /* 0x0b is useful too but hangs old EC firmware */ +static ssize_t show_battery_dump( + struct device *dev, struct device_attribute *attr, char *buf) +{ + int i; + char *p = buf; + int bat = attr_get_bat(attr); + u8 arg0; /* first argument to EC */ + u8 rowa[TP_CONTROLLER_ROW_LEN], + rowb[TP_CONTROLLER_ROW_LEN]; + const u8 junka = 0xAA, + junkb = 0x55; /* junk values for testing changes */ + int ret; + + for (arg0 = MIN_DUMP_ARG0; arg0 <= MAX_DUMP_ARG0; ++arg0) { + if ((p-buf) > PAGE_SIZE-TP_CONTROLLER_ROW_LEN*5) + return -ENOMEM; /* don't overflow sysfs buf */ + /* Read the row twice with different junk values, + * to detect unused output bytes which are left unchanged: */ + ret = read_tp_ec_row(arg0, bat, junka, rowa); + if (ret) + return ret; + ret = read_tp_ec_row(arg0, bat, junkb, rowb); + if (ret) + return ret; + for (i = 0; i < TP_CONTROLLER_ROW_LEN; i++) { + if (rowa[i] == junka && rowb[i] == junkb) + p += sprintf(p, "-- "); /* unused by EC */ + else + p += sprintf(p, "%02x ", rowa[i]); + } + p += sprintf(p, "\n"); + } + return p-buf; +} + + +/********************************************************************* + * sysfs attribute I/O, other than batteries + */ + +static ssize_t show_ac_connected( + struct device *dev, struct device_attribute *attr, char *buf) +{ + int ret = power_device_present(0xFF); + if (ret < 0) + return ret; + return sprintf(buf, "%d\n", ret); /* type: boolean */ +} + +/********************************************************************* + * The "smapi_request" sysfs attribute executes a raw SMAPI call. + * You write to make a request and read to get the result. The state + * is saved globally rather than per fd (sysfs limitation), so + * simultaneous requests may get each other's results! So this is for + * development and debugging only. + */ +#define MAX_SMAPI_ATTR_ANSWER_LEN 128 +static char smapi_attr_answer[MAX_SMAPI_ATTR_ANSWER_LEN] = ""; + +static ssize_t show_smapi_request(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int ret = snprintf(buf, PAGE_SIZE, "%s", smapi_attr_answer); + smapi_attr_answer[0] = '\0'; + return ret; +} + +static ssize_t store_smapi_request(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int inEBX, inECX, inEDI, inESI; + u32 outEBX, outECX, outEDX, outEDI, outESI; + const char *msg; + int ret; + if (sscanf(buf, "%x %x %x %x", &inEBX, &inECX, &inEDI, &inESI) != 4) { + smapi_attr_answer[0] = '\0'; + return -EINVAL; + } + ret = smapi_request( + inEBX, inECX, inEDI, inESI, + &outEBX, &outECX, &outEDX, &outEDI, &outESI, &msg); + snprintf(smapi_attr_answer, MAX_SMAPI_ATTR_ANSWER_LEN, + "%x %x %x %x %x %d '%s'\n", + (unsigned int)outEBX, (unsigned int)outECX, + (unsigned int)outEDX, (unsigned int)outEDI, + (unsigned int)outESI, ret, msg); + if (ret) + return ret; + else + return count; +} + +/********************************************************************* + * Power management: the embedded controller forgets the battery + * thresholds when the system is suspended to disk and unplugged from + * AC and battery, so we restore them upon resume. 
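+ * Only the raw SMAPI values are saved and restored (via get_real_thresh() + * and set_real_thresh()), so the BATMAX_FIX display kludge never leaks into + * the saved state.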
+ */ + +static int saved_threshs[4] = {-1, -1, -1, -1}; /* -1 = don't know */ + +static int tp_suspend(struct platform_device *dev, pm_message_t state) +{ + int restore = (state.event == PM_EVENT_HIBERNATE || + state.event == PM_EVENT_FREEZE); + if (!restore || get_real_thresh(0, THRESH_STOP , &saved_threshs[0])) + saved_threshs[0] = -1; + if (!restore || get_real_thresh(0, THRESH_START, &saved_threshs[1])) + saved_threshs[1] = -1; + if (!restore || get_real_thresh(1, THRESH_STOP , &saved_threshs[2])) + saved_threshs[2] = -1; + if (!restore || get_real_thresh(1, THRESH_START, &saved_threshs[3])) + saved_threshs[3] = -1; + DPRINTK("suspend saved: %d %d %d %d", saved_threshs[0], + saved_threshs[1], saved_threshs[2], saved_threshs[3]); + return 0; +} + +static int tp_resume(struct platform_device *dev) +{ + DPRINTK("resume restoring: %d %d %d %d", saved_threshs[0], + saved_threshs[1], saved_threshs[2], saved_threshs[3]); + if (saved_threshs[0] >= 0) + set_real_thresh(0, THRESH_STOP , saved_threshs[0]); + if (saved_threshs[1] >= 0) + set_real_thresh(0, THRESH_START, saved_threshs[1]); + if (saved_threshs[2] >= 0) + set_real_thresh(1, THRESH_STOP , saved_threshs[2]); + if (saved_threshs[3] >= 0) + set_real_thresh(1, THRESH_START, saved_threshs[3]); + return 0; +} + + +/********************************************************************* + * Driver model + */ + +static struct platform_driver tp_driver = { + .suspend = tp_suspend, + .resume = tp_resume, + .driver = { + .name = "smapi", + .owner = THIS_MODULE + }, +}; + + +/********************************************************************* + * Sysfs device model + */ + +/* Attributes in /sys/devices/platform/smapi/ */ + +static DEVICE_ATTR(ac_connected, 0444, show_ac_connected, NULL); +static DEVICE_ATTR(smapi_request, 0600, show_smapi_request, + store_smapi_request); + +static struct attribute *tp_root_attributes[] = { + &dev_attr_ac_connected.attr, + &dev_attr_smapi_request.attr, + NULL +}; +static struct attribute_group tp_root_attribute_group = { + .attrs = tp_root_attributes +}; + +/* Attributes under /sys/devices/platform/smapi/BAT{0,1}/ : + * Every attribute needs to be defined (i.e., statically allocated) for + * each battery, and then referenced in the attribute list of each battery. + * We use preprocessor voodoo to avoid duplicating the list of attributes 4 + * times. The preprocessor output is just normal sysfs attributes code. 
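+ * For instance, PROVIDE_BAT_ATTRS(0) expands to one struct + * bat_device_attribute per attribute (dev_attr_state_0, dev_attr_voltage_0, + * ...), an array tp_bat0_attributes[] referencing them, and an + * attribute_group named "BAT0".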
+ */ + +/** + * FOREACH_BAT_ATTR - invoke the given macros on all our battery attributes + * @_BAT: battery number (0 or 1) + * @_ATTR_RW: macro to invoke for each read/write attribute + * @_ATTR_R: macro to invoke for each read-only attribute + */ +#define FOREACH_BAT_ATTR(_BAT, _ATTR_RW, _ATTR_R) \ + _ATTR_RW(_BAT, start_charge_thresh) \ + _ATTR_RW(_BAT, stop_charge_thresh) \ + _ATTR_RW(_BAT, inhibit_charge_minutes) \ + _ATTR_RW(_BAT, force_discharge) \ + _ATTR_R(_BAT, installed) \ + _ATTR_R(_BAT, state) \ + _ATTR_R(_BAT, manufacturer) \ + _ATTR_R(_BAT, model) \ + _ATTR_R(_BAT, barcoding) \ + _ATTR_R(_BAT, chemistry) \ + _ATTR_R(_BAT, voltage) \ + _ATTR_R(_BAT, group0_voltage) \ + _ATTR_R(_BAT, group1_voltage) \ + _ATTR_R(_BAT, group2_voltage) \ + _ATTR_R(_BAT, group3_voltage) \ + _ATTR_R(_BAT, current_now) \ + _ATTR_R(_BAT, current_avg) \ + _ATTR_R(_BAT, charging_max_current) \ + _ATTR_R(_BAT, power_now) \ + _ATTR_R(_BAT, power_avg) \ + _ATTR_R(_BAT, remaining_percent) \ + _ATTR_R(_BAT, remaining_percent_error) \ + _ATTR_R(_BAT, remaining_charging_time) \ + _ATTR_R(_BAT, remaining_running_time) \ + _ATTR_R(_BAT, remaining_running_time_now) \ + _ATTR_R(_BAT, remaining_capacity) \ + _ATTR_R(_BAT, last_full_capacity) \ + _ATTR_R(_BAT, design_voltage) \ + _ATTR_R(_BAT, charging_max_voltage) \ + _ATTR_R(_BAT, design_capacity) \ + _ATTR_R(_BAT, cycle_count) \ + _ATTR_R(_BAT, temperature) \ + _ATTR_R(_BAT, serial) \ + _ATTR_R(_BAT, manufacture_date) \ + _ATTR_R(_BAT, first_use_date) \ + _ATTR_R(_BAT, dump) + +/* Define several macros we will feed into FOREACH_BAT_ATTR: */ + +#define DEFINE_BAT_ATTR_RW(_BAT,_NAME) \ + static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ + .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, \ + store_battery_##_NAME), \ + .bat = _BAT \ + }; + +#define DEFINE_BAT_ATTR_R(_BAT,_NAME) \ + static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ + .dev_attr = __ATTR(_NAME, 0444, show_battery_##_NAME, 0), \ + .bat = _BAT \ + }; + +#define REF_BAT_ATTR(_BAT,_NAME) \ + &dev_attr_##_NAME##_##_BAT.dev_attr.attr, + +/* This provides all attributes for one battery: */ + +#define PROVIDE_BAT_ATTRS(_BAT) \ + FOREACH_BAT_ATTR(_BAT, DEFINE_BAT_ATTR_RW, DEFINE_BAT_ATTR_R) \ + static struct attribute *tp_bat##_BAT##_attributes[] = { \ + FOREACH_BAT_ATTR(_BAT, REF_BAT_ATTR, REF_BAT_ATTR) \ + NULL \ + }; \ + static struct attribute_group tp_bat##_BAT##_attribute_group = { \ + .name = "BAT" #_BAT, \ + .attrs = tp_bat##_BAT##_attributes \ + }; + +/* Finally generate the attributes: */ + +PROVIDE_BAT_ATTRS(0) +PROVIDE_BAT_ATTRS(1) + +/* List of attribute groups */ + +static struct attribute_group *attr_groups[] = { + &tp_root_attribute_group, + &tp_bat0_attribute_group, + &tp_bat1_attribute_group, + NULL +}; + + +/********************************************************************* + * Init and cleanup + */ + +static struct attribute_group **next_attr_group; /* next to register */ + +static int __init tp_init(void) +{ + int ret; + printk(KERN_INFO "tp_smapi " TP_VERSION " loading...\n"); + + ret = find_smapi_port(); + if (ret < 0) + goto err; + else + smapi_port = ret; + + if (!request_region(smapi_port, 1, "smapi")) { + printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", + smapi_port); + ret = -ENXIO; + goto err; + } + + if (!request_region(SMAPI_PORT2, 1, "smapi")) { + printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", + SMAPI_PORT2); + ret = -ENXIO; + goto err_port1; + } + + ret = platform_driver_register(&tp_driver); + if (ret) + goto err_port2; + + 
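/* From here on, each failure path unwinds in exact reverse order of the setup steps; see the err_* labels below. */ + 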
pdev = platform_device_alloc("smapi", -1); + if (!pdev) { + ret = -ENOMEM; + goto err_driver; + } + + ret = platform_device_add(pdev); + if (ret) + goto err_device_free; + + for (next_attr_group = attr_groups; *next_attr_group; + ++next_attr_group) { + ret = sysfs_create_group(&pdev->dev.kobj, *next_attr_group); + if (ret) + goto err_attr; + } + + printk(KERN_INFO "tp_smapi successfully loaded (smapi_port=0x%x).\n", + smapi_port); + return 0; + +err_attr: + while (--next_attr_group >= attr_groups) + sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); + platform_device_unregister(pdev); +err_device_free: + platform_device_put(pdev); +err_driver: + platform_driver_unregister(&tp_driver); +err_port2: + release_region(SMAPI_PORT2, 1); +err_port1: + release_region(smapi_port, 1); +err: + printk(KERN_ERR "tp_smapi init failed (ret=%d)!\n", ret); + return ret; +} + +static void __exit tp_exit(void) +{ + while (next_attr_group && --next_attr_group >= attr_groups) + sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); + platform_device_unregister(pdev); + platform_driver_unregister(&tp_driver); + release_region(SMAPI_PORT2, 1); + if (smapi_port) + release_region(smapi_port, 1); + + printk(KERN_INFO "tp_smapi unloaded.\n"); +} + +module_init(tp_init); +module_exit(tp_exit); diff -Nur linux-4.2/drivers/staging/Kconfig linux-4.2-pck/drivers/staging/Kconfig --- linux-4.2/drivers/staging/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/staging/Kconfig 2015-09-08 00:48:22.225030705 -0300 @@ -112,4 +112,6 @@ source "drivers/staging/wilc1000/Kconfig" +source "drivers/staging/vhba/Kconfig" + endif # STAGING diff -Nur linux-4.2/drivers/staging/Makefile linux-4.2-pck/drivers/staging/Makefile --- linux-4.2/drivers/staging/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/staging/Makefile 2015-09-08 00:48:22.225030705 -0300 @@ -25,6 +25,7 @@ obj-$(CONFIG_FB_SM7XX) += sm7xxfb/ obj-$(CONFIG_FB_SM750) += sm750fb/ obj-$(CONFIG_FB_XGI) += xgifb/ +obj-$(CONFIG_VHBA) += vhba/ obj-$(CONFIG_USB_EMXX) += emxx_udc/ obj-$(CONFIG_FT1000) += ft1000/ obj-$(CONFIG_SPEAKUP) += speakup/ diff -Nur linux-4.2/drivers/staging/vhba/Kconfig linux-4.2-pck/drivers/staging/vhba/Kconfig --- linux-4.2/drivers/staging/vhba/Kconfig 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/drivers/staging/vhba/Kconfig 2015-09-08 00:48:22.225030705 -0300 @@ -0,0 +1,9 @@ +config VHBA + tristate "Virtual (SCSI) Host Bus Adapter" + depends on SCSI + ---help--- + This is the in-kernel part of CDEmu, a CD/DVD-ROM device + emulator. + + This driver can also be built as a module. If so, the module + will be called vhba. 
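For context, the tp_smapi attributes above are consumed from userspace as ordinary sysfs files. A minimal illustrative sketch (not part of the patch; the paths follow from the platform device name "smapi" and the "BAT0" attribute group registered in tp_init() above):

#include <stdio.h>
#include <stdlib.h>

#define BAT0 "/sys/devices/platform/smapi/BAT0/"

/* Read a decimal sysfs attribute; returns -1 on any error. */
static int read_attr_int(const char *path)
{
	FILE *f = fopen(path, "r");
	int v = -1;
	if (f) {
		if (fscanf(f, "%d", &v) != 1)
			v = -1;
		fclose(f);
	}
	return v;
}

int main(void)
{
	FILE *f;

	printf("battery 0 at %d%%\n", read_attr_int(BAT0 "remaining_percent"));

	/* Cap charging at 80%; store_battery_stop_charge_thresh() clamps the
	 * value and keeps the start threshold MIN_THRESH_DELTA below it. */
	f = fopen(BAT0 "stop_charge_thresh", "w");
	if (!f)
		return EXIT_FAILURE;
	fprintf(f, "80\n");
	fclose(f);
	return EXIT_SUCCESS;
}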
diff -Nur linux-4.2/drivers/staging/vhba/Makefile linux-4.2-pck/drivers/staging/vhba/Makefile --- linux-4.2/drivers/staging/vhba/Makefile 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/drivers/staging/vhba/Makefile 2015-09-08 00:48:22.225030705 -0300 @@ -0,0 +1,4 @@ +VHBA_VERSION := 20140928 + +obj-$(CONFIG_VHBA) += vhba.o +ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror diff -Nur linux-4.2/drivers/staging/vhba/vhba.c linux-4.2-pck/drivers/staging/vhba/vhba.c --- linux-4.2/drivers/staging/vhba/vhba.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/drivers/staging/vhba/vhba.c 2015-09-08 00:48:22.228363880 -0300 @@ -0,0 +1,1071 @@ +/* + * vhba.c + * + * Copyright (C) 2007-2012 Chia-I Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/wait.h> +#include <linux/miscdevice.h> +#include <linux/poll.h> +#ifdef CONFIG_COMPAT +#include <linux/compat.h> +#endif +#include <linux/platform_device.h> +#include <scsi/scsi.h> +#include <scsi/scsi_host.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_device.h> + +/* scatterlist.page_link and sg_page() were introduced in 2.6.24 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24) +#define USE_SG_PAGE +#include <linux/scatterlist.h> +#endif + +MODULE_AUTHOR("Chia-I Wu"); +MODULE_VERSION(VHBA_VERSION); +MODULE_DESCRIPTION("Virtual SCSI HBA"); +MODULE_LICENSE("GPL"); + +#ifdef DEBUG +#define DPRINTK(fmt, args...) printk(KERN_DEBUG "%s: " fmt, __FUNCTION__, ## args) +#else +#define DPRINTK(fmt, args...) +#endif + +/* scmd_dbg was introduced in 3.15 */ +#ifndef scmd_dbg +#define scmd_dbg(scmd, fmt, a...) \ + dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a) +#endif + +#ifndef scmd_warn +#define scmd_warn(scmd, fmt, a...) 
\ + dev_warn(&(scmd)->device->sdev_gendev, fmt, ##a) +#endif + +#define VHBA_MAX_SECTORS_PER_IO 256 +#define VHBA_MAX_ID 32 +#define VHBA_CAN_QUEUE 32 +#define VHBA_INVALID_ID VHBA_MAX_ID + +#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL) +#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL) + + +/* SCSI macros were introduced in 2.6.23 */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23) +#define scsi_sg_count(cmd) ((cmd)->use_sg) +#define scsi_sglist(cmd) ((cmd)->request_buffer) +#define scsi_bufflen(cmd) ((cmd)->request_bufflen) +#define scsi_set_resid(cmd, to_read) {(cmd)->resid = (to_read);} +#endif + +/* 1-argument form of k[un]map_atomic was introduced in 2.6.37-rc1; + 2-argument form was deprecated in 3.4-rc1 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 37) +#define vhba_kmap_atomic kmap_atomic +#define vhba_kunmap_atomic kunmap_atomic +#else +#define vhba_kmap_atomic(page) kmap_atomic(page, KM_USER0) +#define vhba_kunmap_atomic(page) kunmap_atomic(page, KM_USER0) +#endif + + +enum vhba_req_state { + VHBA_REQ_FREE, + VHBA_REQ_PENDING, + VHBA_REQ_READING, + VHBA_REQ_SENT, + VHBA_REQ_WRITING, +}; + +struct vhba_command { + struct scsi_cmnd *cmd; + int status; + struct list_head entry; +}; + +struct vhba_device { + uint id; + spinlock_t cmd_lock; + struct list_head cmd_list; + wait_queue_head_t cmd_wq; + atomic_t refcnt; +}; + +struct vhba_host { + struct Scsi_Host *shost; + spinlock_t cmd_lock; + int cmd_next; + struct vhba_command commands[VHBA_CAN_QUEUE]; + spinlock_t dev_lock; + struct vhba_device *devices[VHBA_MAX_ID]; + int num_devices; + DECLARE_BITMAP(chgmap, VHBA_MAX_ID); + int chgtype[VHBA_MAX_ID]; + struct work_struct scan_devices; +}; + +#define MAX_COMMAND_SIZE 16 + +struct vhba_request { + __u32 tag; + __u32 lun; + __u8 cdb[MAX_COMMAND_SIZE]; + __u8 cdb_len; + __u32 data_len; +}; + +struct vhba_response { + __u32 tag; + __u32 status; + __u32 data_len; +}; + +static struct vhba_command *vhba_alloc_command (void); +static void vhba_free_command (struct vhba_command *vcmd); + +static struct platform_device vhba_platform_device; + +static struct vhba_device *vhba_device_alloc (void) +{ + struct vhba_device *vdev; + + vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL); + if (!vdev) { + return NULL; + } + + vdev->id = VHBA_INVALID_ID; + spin_lock_init(&vdev->cmd_lock); + INIT_LIST_HEAD(&vdev->cmd_list); + init_waitqueue_head(&vdev->cmd_wq); + atomic_set(&vdev->refcnt, 1); + + return vdev; +} + +static void vhba_device_put (struct vhba_device *vdev) +{ + if (atomic_dec_and_test(&vdev->refcnt)) { + kfree(vdev); + } +} + +static struct vhba_device *vhba_device_get (struct vhba_device *vdev) +{ + atomic_inc(&vdev->refcnt); + + return vdev; +} + +static int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd) +{ + struct vhba_command *vcmd; + unsigned long flags; + + vcmd = vhba_alloc_command(); + if (!vcmd) { + return SCSI_MLQUEUE_HOST_BUSY; + } + + vcmd->cmd = cmd; + + spin_lock_irqsave(&vdev->cmd_lock, flags); + list_add_tail(&vcmd->entry, &vdev->cmd_list); + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + wake_up_interruptible(&vdev->cmd_wq); + + return 0; +} + +static int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd) +{ + struct vhba_command *vcmd; + int retval; + unsigned long flags; + + spin_lock_irqsave(&vdev->cmd_lock, flags); + list_for_each_entry(vcmd, &vdev->cmd_list, entry) { + if (vcmd->cmd == cmd) { + list_del_init(&vcmd->entry); + break; + } + } 
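+ /* If the loop above finds no match, list_for_each_entry() leaves + * &vcmd->entry equal to &vdev->cmd_list (the head sentinel), which is + * exactly what the check below tests. */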
+ + /* command not found */ + if (&vcmd->entry == &vdev->cmd_list) { + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + return SUCCESS; + } + + while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) { + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + scmd_dbg(cmd, "wait for I/O before aborting\n"); + schedule_timeout(1); + spin_lock_irqsave(&vdev->cmd_lock, flags); + } + + retval = (vcmd->status == VHBA_REQ_SENT) ? FAILED : SUCCESS; + + vhba_free_command(vcmd); + + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + return retval; +} + +static inline void vhba_scan_devices_add (struct vhba_host *vhost, int id) +{ + struct scsi_device *sdev; + + sdev = scsi_device_lookup(vhost->shost, 0, id, 0); + if (!sdev) { + scsi_add_device(vhost->shost, 0, id, 0); + } else { + dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device 0:%d:0!\n", id); + scsi_device_put(sdev); + } +} + +static inline void vhba_scan_devices_remove (struct vhba_host *vhost, int id) +{ + struct scsi_device *sdev; + + sdev = scsi_device_lookup(vhost->shost, 0, id, 0); + if (sdev) { + scsi_remove_device(sdev); + scsi_device_put(sdev); + } else { + dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device 0:%d:0!\n", id); + } +} + +static void vhba_scan_devices (struct work_struct *work) +{ + struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices); + unsigned long flags; + int id, change, exists; + + while (1) { + spin_lock_irqsave(&vhost->dev_lock, flags); + + id = find_first_bit(vhost->chgmap, vhost->shost->max_id); + if (id >= vhost->shost->max_id) { + spin_unlock_irqrestore(&vhost->dev_lock, flags); + break; + } + change = vhost->chgtype[id]; + exists = vhost->devices[id] != NULL; + + vhost->chgtype[id] = 0; + clear_bit(id, vhost->chgmap); + + spin_unlock_irqrestore(&vhost->dev_lock, flags); + + if (change < 0) { + dev_dbg(&vhost->shost->shost_gendev, "trying to remove target 0:%d:0\n", id); + vhba_scan_devices_remove(vhost, id); + } else if (change > 0) { + dev_dbg(&vhost->shost->shost_gendev, "trying to add target 0:%d:0\n", id); + vhba_scan_devices_add(vhost, id); + } else { + /* quick sequence of add/remove or remove/add; we determine + which one it was by checking if device structure exists */ + if (exists) { + /* remove followed by add: remove and (re)add */ + dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target 0:%d:0\n", id); + vhba_scan_devices_remove(vhost, id); + vhba_scan_devices_add(vhost, id); + } else { + /* add followed by remove: no-op */ + dev_dbg(&vhost->shost->shost_gendev, "no-op for target 0:%d:0\n", id); + } + } + } +} + +static int vhba_add_device (struct vhba_device *vdev) +{ + struct vhba_host *vhost; + int i; + unsigned long flags; + + vhost = platform_get_drvdata(&vhba_platform_device); + + vhba_device_get(vdev); + + spin_lock_irqsave(&vhost->dev_lock, flags); + if (vhost->num_devices >= vhost->shost->max_id) { + spin_unlock_irqrestore(&vhost->dev_lock, flags); + vhba_device_put(vdev); + return -EBUSY; + } + + for (i = 0; i < vhost->shost->max_id; i++) { + if (vhost->devices[i] == NULL) { + vdev->id = i; + vhost->devices[i] = vdev; + vhost->num_devices++; + set_bit(vdev->id, vhost->chgmap); + vhost->chgtype[vdev->id]++; + break; + } + } + spin_unlock_irqrestore(&vhost->dev_lock, flags); + + schedule_work(&vhost->scan_devices); + + return 0; +} + +static int vhba_remove_device (struct vhba_device *vdev) +{ + struct vhba_host *vhost; + unsigned long flags; + + vhost = 
platform_get_drvdata(&vhba_platform_device); + + spin_lock_irqsave(&vhost->dev_lock, flags); + set_bit(vdev->id, vhost->chgmap); + vhost->chgtype[vdev->id]--; + vhost->devices[vdev->id] = NULL; + vhost->num_devices--; + vdev->id = VHBA_INVALID_ID; + spin_unlock_irqrestore(&vhost->dev_lock, flags); + + vhba_device_put(vdev); + + schedule_work(&vhost->scan_devices); + + return 0; +} + +static struct vhba_device *vhba_lookup_device (int id) +{ + struct vhba_host *vhost; + struct vhba_device *vdev = NULL; + unsigned long flags; + + vhost = platform_get_drvdata(&vhba_platform_device); + + if (likely(id < vhost->shost->max_id)) { + spin_lock_irqsave(&vhost->dev_lock, flags); + vdev = vhost->devices[id]; + if (vdev) { + vdev = vhba_device_get(vdev); + } + + spin_unlock_irqrestore(&vhost->dev_lock, flags); + } + + return vdev; +} + +static struct vhba_command *vhba_alloc_command (void) +{ + struct vhba_host *vhost; + struct vhba_command *vcmd; + unsigned long flags; + int i; + + vhost = platform_get_drvdata(&vhba_platform_device); + + spin_lock_irqsave(&vhost->cmd_lock, flags); + + vcmd = vhost->commands + vhost->cmd_next++; + if (vcmd->status != VHBA_REQ_FREE) { + for (i = 0; i < vhost->shost->can_queue; i++) { + vcmd = vhost->commands + i; + + if (vcmd->status == VHBA_REQ_FREE) { + vhost->cmd_next = i + 1; + break; + } + } + + if (i == vhost->shost->can_queue) { + vcmd = NULL; + } + } + + if (vcmd) { + vcmd->status = VHBA_REQ_PENDING; + } + + vhost->cmd_next %= vhost->shost->can_queue; + + spin_unlock_irqrestore(&vhost->cmd_lock, flags); + + return vcmd; +} + +static void vhba_free_command (struct vhba_command *vcmd) +{ + struct vhba_host *vhost; + unsigned long flags; + + vhost = platform_get_drvdata(&vhba_platform_device); + + spin_lock_irqsave(&vhost->cmd_lock, flags); + vcmd->status = VHBA_REQ_FREE; + spin_unlock_irqrestore(&vhost->cmd_lock, flags); +} + +static int vhba_queuecommand_lck (struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)) +{ + struct vhba_device *vdev; + int retval; + + scmd_dbg(cmd, "queue %lu\n", cmd->serial_number); + + vdev = vhba_lookup_device(cmd->device->id); + if (!vdev) { + scmd_dbg(cmd, "no such device\n"); + + cmd->result = DID_NO_CONNECT << 16; + done(cmd); + + return 0; + } + + cmd->scsi_done = done; + retval = vhba_device_queue(vdev, cmd); + + vhba_device_put(vdev); + + return retval; +} + +#ifdef DEF_SCSI_QCMD +DEF_SCSI_QCMD(vhba_queuecommand) +#else +#define vhba_queuecommand vhba_queuecommand_lck +#endif + +static int vhba_abort (struct scsi_cmnd *cmd) +{ + struct vhba_device *vdev; + int retval = SUCCESS; + + scmd_warn(cmd, "abort %lu\n", cmd->serial_number); + + vdev = vhba_lookup_device(cmd->device->id); + if (vdev) { + retval = vhba_device_dequeue(vdev, cmd); + vhba_device_put(vdev); + } else { + cmd->result = DID_NO_CONNECT << 16; + } + + return retval; +} + +static struct scsi_host_template vhba_template = { + .module = THIS_MODULE, + .name = "vhba", + .proc_name = "vhba", + .queuecommand = vhba_queuecommand, + .eh_abort_handler = vhba_abort, + .can_queue = VHBA_CAN_QUEUE, + .this_id = -1, + .cmd_per_lun = 1, + .max_sectors = VHBA_MAX_SECTORS_PER_IO, + .sg_tablesize = 256, +}; + +static ssize_t do_request (struct scsi_cmnd *cmd, char __user *buf, size_t buf_len) +{ + struct vhba_request vreq; + ssize_t ret; + + scmd_dbg(cmd, "request %lu, cdb 0x%x, bufflen %d, use_sg %d\n", + cmd->serial_number, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd)); + + ret = sizeof(vreq); + if (DATA_TO_DEVICE(cmd->sc_data_direction)) { + ret += scsi_bufflen(cmd); 
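+		/* a write command's payload travels inline, appended right
+		   after the request header copied to userspace below, so
+		   account for it in the size check */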
+	}
+
+	if (ret > buf_len) {
+		scmd_warn(cmd, "buffer too small (%zu < %zd) for a request\n", buf_len, ret);
+		return -EIO;
+	}
+
+	vreq.tag = cmd->serial_number;
+	vreq.lun = cmd->device->lun;
+	memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE);
+	vreq.cdb_len = cmd->cmd_len;
+	vreq.data_len = scsi_bufflen(cmd);
+
+	if (copy_to_user(buf, &vreq, sizeof(vreq))) {
+		return -EFAULT;
+	}
+
+	if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) {
+		buf += sizeof(vreq);
+
+		if (scsi_sg_count(cmd)) {
+			unsigned char buf_stack[64];
+			unsigned char *kaddr, *uaddr, *kbuf;
+			struct scatterlist *sg = scsi_sglist(cmd);
+			int i;
+
+			uaddr = (unsigned char *) buf;
+
+			if (vreq.data_len > 64) {
+				kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+				if (!kbuf) {
+					return -ENOMEM;
+				}
+			} else {
+				kbuf = buf_stack;
+			}
+
+			for (i = 0; i < scsi_sg_count(cmd); i++) {
+				size_t len = sg[i].length;
+
+#ifdef USE_SG_PAGE
+				kaddr = vhba_kmap_atomic(sg_page(&sg[i]));
+#else
+				kaddr = vhba_kmap_atomic(sg[i].page);
+#endif
+				memcpy(kbuf, kaddr + sg[i].offset, len);
+				vhba_kunmap_atomic(kaddr);
+
+				if (copy_to_user(uaddr, kbuf, len)) {
+					if (kbuf != buf_stack) {
+						kfree(kbuf);
+					}
+					return -EFAULT;
+				}
+				uaddr += len;
+			}
+
+			if (kbuf != buf_stack) {
+				kfree(kbuf);
+			}
+		} else {
+			if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) {
+				return -EFAULT;
+			}
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t do_response (struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res)
+{
+	ssize_t ret = 0;
+
+	scmd_dbg(cmd, "response %lu, status %x, data len %d, use_sg %d\n",
+		cmd->serial_number, res->status, res->data_len, scsi_sg_count(cmd));
+
+	if (res->status) {
+		unsigned char sense_stack[SCSI_SENSE_BUFFERSIZE];
+
+		if (res->data_len > SCSI_SENSE_BUFFERSIZE) {
+			scmd_warn(cmd, "truncate sense (%d < %d)\n", SCSI_SENSE_BUFFERSIZE, res->data_len);
+			res->data_len = SCSI_SENSE_BUFFERSIZE;
+		}
+
+		/* Copy via temporary buffer on stack in order to avoid problems
+		   with PAX on grsecurity-enabled kernels */
+		if (copy_from_user(sense_stack, buf, res->data_len)) {
+			return -EFAULT;
+		}
+		memcpy(cmd->sense_buffer, sense_stack, res->data_len);
+
+		cmd->result = res->status;
+
+		ret += res->data_len;
+	} else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) {
+		size_t to_read;
+
+		if (res->data_len > scsi_bufflen(cmd)) {
+			scmd_warn(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len);
+			res->data_len = scsi_bufflen(cmd);
+		}
+
+		to_read = res->data_len;
+
+		if (scsi_sg_count(cmd)) {
+			unsigned char buf_stack[64];
+			unsigned char *kaddr, *uaddr, *kbuf;
+			struct scatterlist *sg = scsi_sglist(cmd);
+			int i;
+
+			uaddr = (unsigned char *)buf;
+
+			if (res->data_len > 64) {
+				kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+				if (!kbuf) {
+					return -ENOMEM;
+				}
+			} else {
+				kbuf = buf_stack;
+			}
+
+			for (i = 0; i < scsi_sg_count(cmd); i++) {
+				size_t len = (sg[i].length < to_read) ? 
sg[i].length : to_read; + + if (copy_from_user(kbuf, uaddr, len)) { + if (kbuf != buf_stack) { + kfree(kbuf); + } + return -EFAULT; + } + uaddr += len; + +#ifdef USE_SG_PAGE + kaddr = vhba_kmap_atomic(sg_page(&sg[i])); +#else + kaddr = vhba_kmap_atomic(sg[i].page); +#endif + memcpy(kaddr + sg[i].offset, kbuf, len); + vhba_kunmap_atomic(kaddr); + + to_read -= len; + if (to_read == 0) { + break; + } + } + + if (kbuf != buf_stack) { + kfree(kbuf); + } + } else { + if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) { + return -EFAULT; + } + + to_read -= res->data_len; + } + + scsi_set_resid(cmd, to_read); + + ret += res->data_len - to_read; + } + + return ret; +} + +static inline struct vhba_command *next_command (struct vhba_device *vdev) +{ + struct vhba_command *vcmd; + + list_for_each_entry(vcmd, &vdev->cmd_list, entry) { + if (vcmd->status == VHBA_REQ_PENDING) { + break; + } + } + + if (&vcmd->entry == &vdev->cmd_list) { + vcmd = NULL; + } + + return vcmd; +} + +static inline struct vhba_command *match_command (struct vhba_device *vdev, u32 tag) +{ + struct vhba_command *vcmd; + + list_for_each_entry(vcmd, &vdev->cmd_list, entry) { + if (vcmd->cmd->serial_number == tag) { + break; + } + } + + if (&vcmd->entry == &vdev->cmd_list) { + vcmd = NULL; + } + + return vcmd; +} + +static struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags) +{ + struct vhba_command *vcmd; + DEFINE_WAIT(wait); + + while (!(vcmd = next_command(vdev))) { + if (signal_pending(current)) { + break; + } + + prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE); + + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + schedule(); + + spin_lock_irqsave(&vdev->cmd_lock, flags); + } + + finish_wait(&vdev->cmd_wq, &wait); + if (vcmd) { + vcmd->status = VHBA_REQ_READING; + } + + return vcmd; +} + +static ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset) +{ + struct vhba_device *vdev; + struct vhba_command *vcmd; + ssize_t ret; + unsigned long flags; + + vdev = file->private_data; + + /* Get next command */ + if (file->f_flags & O_NONBLOCK) { + /* Non-blocking variant */ + spin_lock_irqsave(&vdev->cmd_lock, flags); + vcmd = next_command(vdev); + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + if (!vcmd) { + return -EWOULDBLOCK; + } + } else { + /* Blocking variant */ + spin_lock_irqsave(&vdev->cmd_lock, flags); + vcmd = wait_command(vdev, flags); + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + if (!vcmd) { + return -ERESTARTSYS; + } + } + + ret = do_request(vcmd->cmd, buf, buf_len); + + spin_lock_irqsave(&vdev->cmd_lock, flags); + if (ret >= 0) { + vcmd->status = VHBA_REQ_SENT; + *offset += ret; + } else { + vcmd->status = VHBA_REQ_PENDING; + } + + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + return ret; +} + +static ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset) +{ + struct vhba_device *vdev; + struct vhba_command *vcmd; + struct vhba_response res; + ssize_t ret; + unsigned long flags; + + if (buf_len < sizeof(res)) { + return -EIO; + } + + if (copy_from_user(&res, buf, sizeof(res))) { + return -EFAULT; + } + + vdev = file->private_data; + + spin_lock_irqsave(&vdev->cmd_lock, flags); + vcmd = match_command(vdev, res.tag); + if (!vcmd || vcmd->status != VHBA_REQ_SENT) { + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + DPRINTK("not expecting response\n"); + return -EIO; + } + vcmd->status = VHBA_REQ_WRITING; + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + ret = 
do_response(vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res); + + spin_lock_irqsave(&vdev->cmd_lock, flags); + if (ret >= 0) { + vcmd->cmd->scsi_done(vcmd->cmd); + ret += sizeof(res); + + /* don't compete with vhba_device_dequeue */ + if (!list_empty(&vcmd->entry)) { + list_del_init(&vcmd->entry); + vhba_free_command(vcmd); + } + } else { + vcmd->status = VHBA_REQ_SENT; + } + + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + return ret; +} + +static long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg) +{ + struct vhba_device *vdev = file->private_data; + struct vhba_host *vhost; + struct scsi_device *sdev; + + switch (cmd) { + case 0xBEEF001: { + vhost = platform_get_drvdata(&vhba_platform_device); + sdev = scsi_device_lookup(vhost->shost, 0, vdev->id, 0); + + if (sdev) { + int id[4] = { + sdev->host->host_no, + sdev->channel, + sdev->id, + sdev->lun + }; + + scsi_device_put(sdev); + + if (copy_to_user((void *)arg, id, sizeof(id))) { + return -EFAULT; + } + + return 0; + } else { + return -ENODEV; + } + } + } + + return -ENOTTY; +} + +#ifdef CONFIG_COMPAT +static long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg) +{ + unsigned long compat_arg = (unsigned long)compat_ptr(arg); + return vhba_ctl_ioctl(file, cmd, compat_arg); +} +#endif + +static unsigned int vhba_ctl_poll (struct file *file, poll_table *wait) +{ + struct vhba_device *vdev = file->private_data; + unsigned int mask = 0; + unsigned long flags; + + poll_wait(file, &vdev->cmd_wq, wait); + + spin_lock_irqsave(&vdev->cmd_lock, flags); + if (next_command(vdev)) { + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + return mask; +} + +static int vhba_ctl_open (struct inode *inode, struct file *file) +{ + struct vhba_device *vdev; + int retval; + + DPRINTK("open\n"); + + /* check if vhba is probed */ + if (!platform_get_drvdata(&vhba_platform_device)) { + return -ENODEV; + } + + vdev = vhba_device_alloc(); + if (!vdev) { + return -ENOMEM; + } + + if (!(retval = vhba_add_device(vdev))) { + file->private_data = vdev; + } + + vhba_device_put(vdev); + + return retval; +} + +static int vhba_ctl_release (struct inode *inode, struct file *file) +{ + struct vhba_device *vdev; + struct vhba_command *vcmd; + unsigned long flags; + + DPRINTK("release\n"); + + vdev = file->private_data; + + vhba_device_get(vdev); + vhba_remove_device(vdev); + + spin_lock_irqsave(&vdev->cmd_lock, flags); + list_for_each_entry(vcmd, &vdev->cmd_list, entry) { + WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING); + + scmd_warn(vcmd->cmd, "device released with command %lu\n", vcmd->cmd->serial_number); + vcmd->cmd->result = DID_NO_CONNECT << 16; + vcmd->cmd->scsi_done(vcmd->cmd); + + vhba_free_command(vcmd); + } + INIT_LIST_HEAD(&vdev->cmd_list); + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + vhba_device_put(vdev); + + return 0; +} + +static struct file_operations vhba_ctl_fops = { + .owner = THIS_MODULE, + .open = vhba_ctl_open, + .release = vhba_ctl_release, + .read = vhba_ctl_read, + .write = vhba_ctl_write, + .poll = vhba_ctl_poll, + .unlocked_ioctl = vhba_ctl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vhba_ctl_compat_ioctl, +#endif +}; + +static struct miscdevice vhba_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "vhba_ctl", + .fops = &vhba_ctl_fops, +}; + +static int vhba_probe (struct platform_device *pdev) +{ + struct Scsi_Host *shost; + struct vhba_host *vhost; + int i; + + shost = 
scsi_host_alloc(&vhba_template, sizeof(struct vhba_host)); + if (!shost) { + return -ENOMEM; + } + + shost->max_id = VHBA_MAX_ID; + /* we don't support lun > 0 */ + shost->max_lun = 1; + shost->max_cmd_len = MAX_COMMAND_SIZE; + + vhost = (struct vhba_host *)shost->hostdata; + memset(vhost, 0, sizeof(*vhost)); + + vhost->shost = shost; + vhost->num_devices = 0; + spin_lock_init(&vhost->dev_lock); + spin_lock_init(&vhost->cmd_lock); + INIT_WORK(&vhost->scan_devices, vhba_scan_devices); + vhost->cmd_next = 0; + for (i = 0; i < vhost->shost->can_queue; i++) { + vhost->commands[i].status = VHBA_REQ_FREE; + } + + platform_set_drvdata(pdev, vhost); + + if (scsi_add_host(shost, &pdev->dev)) { + scsi_host_put(shost); + return -ENOMEM; + } + + return 0; +} + +static int vhba_remove (struct platform_device *pdev) +{ + struct vhba_host *vhost; + struct Scsi_Host *shost; + + vhost = platform_get_drvdata(pdev); + shost = vhost->shost; + + scsi_remove_host(shost); + scsi_host_put(shost); + + return 0; +} + +static void vhba_release (struct device * dev) +{ + return; +} + +static struct platform_device vhba_platform_device = { + .name = "vhba", + .id = -1, + .dev = { + .release = vhba_release, + }, +}; + +static struct platform_driver vhba_platform_driver = { + .driver = { + .owner = THIS_MODULE, + .name = "vhba", + }, + .probe = vhba_probe, + .remove = vhba_remove, +}; + +static int __init vhba_init (void) +{ + int ret; + + ret = platform_device_register(&vhba_platform_device); + if (ret < 0) { + return ret; + } + + ret = platform_driver_register(&vhba_platform_driver); + if (ret < 0) { + platform_device_unregister(&vhba_platform_device); + return ret; + } + + ret = misc_register(&vhba_miscdev); + if (ret < 0) { + platform_driver_unregister(&vhba_platform_driver); + platform_device_unregister(&vhba_platform_device); + return ret; + } + + return 0; +} + +static void __exit vhba_exit(void) +{ + misc_deregister(&vhba_miscdev); + platform_driver_unregister(&vhba_platform_driver); + platform_device_unregister(&vhba_platform_device); +} + +module_init(vhba_init); +module_exit(vhba_exit); + diff -Nur linux-4.2/drivers/tty/Kconfig linux-4.2-pck/drivers/tty/Kconfig --- linux-4.2/drivers/tty/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/tty/Kconfig 2015-09-08 00:48:22.228363880 -0300 @@ -75,6 +75,124 @@ def_bool y depends on VT_CONSOLE && PM_SLEEP +menuconfig VT_CKO + bool "Colored kernel message output" + depends on VT_CONSOLE + ---help--- + This option enables kernel messages to be emitted in + colors other than the default. + + The color value you need to enter is composed (OR-ed) + of a foreground and a background color. + + Foreground: + 0x00 = black, 0x08 = dark gray, + 0x01 = red, 0x09 = light red, + 0x02 = green, 0x0A = light green, + 0x03 = brown, 0x0B = yellow, + 0x04 = blue, 0x0C = light blue, + 0x05 = magenta, 0x0D = light magenta, + 0x06 = cyan, 0x0E = light cyan, + 0x07 = gray, 0x0F = white, + + (Foreground colors 0x08 to 0x0F do not work when a VGA + console font with 512 glyphs is used.) + + Background: + 0x00 = black, 0x40 = blue, + 0x10 = red, 0x50 = magenta, + 0x20 = green, 0x60 = cyan, + 0x30 = brown, 0x70 = gray, + + For example, 0x1F would yield white on red. + + If unsure, say N. + +config VT_PRINTK_EMERG_COLOR + hex "Emergency messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel emergency messages will + be printed to the console. 
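+
+	  For example, combining the values from the tables above,
+	  0x4F (0x40 | 0x0F) would yield white on blue.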
+ +config VT_PRINTK_ALERT_COLOR + hex "Alert messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel alert messages will + be printed to the console. + +config VT_PRINTK_CRIT_COLOR + hex "Critical messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel critical messages will + be printed to the console. + +config VT_PRINTK_ERR_COLOR + hex "Error messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel error messages will + be printed to the console. + +config VT_PRINTK_WARNING_COLOR + hex "Warning messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel warning messages will + be printed to the console. + +config VT_PRINTK_NOTICE_COLOR + hex "Notice messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel notice messages will + be printed to the console. + +config VT_PRINTK_INFO_COLOR + hex "Information messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel information messages will + be printed to the console. + +config VT_PRINTK_DEBUG_COLOR + hex "Debug messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel debug messages will + be printed to the console. + +config NR_TTY_DEVICES + int "Maximum tty device number" + depends on VT + range 12 63 + default 63 + ---help--- + This option is used to change the number of tty devices in /dev. + The default value is 63. The lowest number you can set is 12, + 63 is also the upper limit so we don't overrun the serial + consoles. + + If unsure, say 63. + config HW_CONSOLE bool depends on VT && !UML diff -Nur linux-4.2/drivers/tty/serial/8250/8250_core.c linux-4.2-pck/drivers/tty/serial/8250/8250_core.c --- linux-4.2/drivers/tty/serial/8250/8250_core.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/tty/serial/8250/8250_core.c 2015-09-08 00:48:22.228363880 -0300 @@ -3339,7 +3339,7 @@ * The console_lock must be held when we get here. 
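+ * The extra loglevel argument only mirrors the extended
+ * console->write() prototype; the 8250 driver does not colorize its
+ * output, the value is used by the VT console (see vt.c).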
*/ static void serial8250_console_write(struct uart_8250_port *up, const char *s, - unsigned int count) + unsigned int count, unsigned int loglevel) { struct uart_port *port = &up->port; unsigned long flags; @@ -3413,11 +3413,11 @@ } static void univ8250_console_write(struct console *co, const char *s, - unsigned int count) + unsigned int count, unsigned int loglevel) { struct uart_8250_port *up = &serial8250_ports[co->index]; - serial8250_console_write(up, s, count); + serial8250_console_write(up, s, count, loglevel); } static unsigned int probe_baud(struct uart_port *port) diff -Nur linux-4.2/drivers/tty/serial/8250/8250_early.c linux-4.2-pck/drivers/tty/serial/8250/8250_early.c --- linux-4.2/drivers/tty/serial/8250/8250_early.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/tty/serial/8250/8250_early.c 2015-09-08 00:48:22.228363880 -0300 @@ -90,7 +90,7 @@ } static void __init early_serial8250_write(struct console *console, - const char *s, unsigned int count) + const char *s, unsigned int count, unsigned int loglevel) { struct earlycon_device *device = console->data; struct uart_port *port = &device->port; diff -Nur linux-4.2/drivers/tty/vt/vt.c linux-4.2-pck/drivers/tty/vt/vt.c --- linux-4.2/drivers/tty/vt/vt.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/tty/vt/vt.c 2015-09-08 00:48:22.228363880 -0300 @@ -71,6 +71,7 @@ */ #include +#include #include #include #include @@ -2531,16 +2532,44 @@ return kmsg_con; } +#ifdef CONFIG_VT_CKO +static unsigned int printk_color[8] __read_mostly = { + CONFIG_VT_PRINTK_EMERG_COLOR, /* KERN_EMERG */ + CONFIG_VT_PRINTK_ALERT_COLOR, /* KERN_ALERT */ + CONFIG_VT_PRINTK_CRIT_COLOR, /* KERN_CRIT */ + CONFIG_VT_PRINTK_ERR_COLOR, /* KERN_ERR */ + CONFIG_VT_PRINTK_WARNING_COLOR, /* KERN_WARNING */ + CONFIG_VT_PRINTK_NOTICE_COLOR, /* KERN_NOTICE */ + CONFIG_VT_PRINTK_INFO_COLOR, /* KERN_INFO */ + CONFIG_VT_PRINTK_DEBUG_COLOR, /* KERN_DEBUG */ +}; +module_param_array(printk_color, uint, NULL, S_IRUGO | S_IWUSR); + +static inline void vc_set_color(struct vc_data *vc, unsigned char color) +{ + vc->vc_color = color_table[color & 0xF] | + (color_table[(color >> 4) & 0x7] << 4) | + (color & 0x80); + update_attr(vc); +} +#else +static unsigned int printk_color[8]; +static inline void vc_set_color(const struct vc_data *vc, unsigned char c) +{ +} +#endif + /* * Console on virtual terminal * * The console must be locked when we get here. */ -static void vt_console_print(struct console *co, const char *b, unsigned count) +static void vt_console_print(struct console *co, const char *b, unsigned count, + unsigned int loglevel) { struct vc_data *vc = vc_cons[fg_console].d; - unsigned char c; + unsigned char current_color, c; static DEFINE_SPINLOCK(printing_lock); const ushort *start; ushort cnt = 0; @@ -2576,11 +2605,20 @@ start = (ushort *)vc->vc_pos; + /* + * We always get a valid loglevel - <8> and "no level" is transformed + * to <4> in the typical kernel. 
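+	 * The value can therefore be used directly as an index into the
+	 * eight-entry printk_color[] table above.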
+ */ + current_color = printk_color[loglevel]; + vc_set_color(vc, current_color); + + /* Contrived structure to try to emulate original need_wrap behaviour * Problems caused when we have need_wrap set on '\n' character */ while (count--) { c = *b++; if (c == 10 || c == 13 || c == 8 || vc->vc_need_wrap) { + vc_set_color(vc, vc->vc_def_color); if (cnt > 0) { if (CON_IS_VISIBLE(vc)) vc->vc_sw->con_putcs(vc, start, cnt, vc->vc_y, vc->vc_x); @@ -2593,6 +2631,7 @@ bs(vc); start = (ushort *)vc->vc_pos; myx = vc->vc_x; + vc_set_color(vc, current_color); continue; } if (c != 13) @@ -2600,6 +2639,7 @@ cr(vc); start = (ushort *)vc->vc_pos; myx = vc->vc_x; + vc_set_color(vc, current_color); if (c == 10 || c == 13) continue; } @@ -2622,6 +2662,7 @@ vc->vc_need_wrap = 1; } } + vc_set_color(vc, vc->vc_def_color); set_cursor(vc); notify_update(vc); diff -Nur linux-4.2/fs/exec.c linux-4.2-pck/fs/exec.c --- linux-4.2/fs/exec.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/fs/exec.c 2015-09-08 00:51:03.460668141 -0300 @@ -19,7 +19,7 @@ * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary - * formats. + * formats. */ #include @@ -56,6 +56,9 @@ #include #include #include +#include + +#include #include #include @@ -784,8 +787,10 @@ if (err) goto exit; - if (name->name[0] != '\0') + if (name->name[0] != '\0') { fsnotify_open(file); + trace_open_exec(name->name); + } out: return file; @@ -1156,6 +1161,7 @@ /* An exec changes our domain. We are no longer part of the thread group */ current->self_exec_id++; + flush_signal_handlers(current, 0); do_close_on_exec(current->files); } diff -Nur linux-4.2/fs/open.c linux-4.2-pck/fs/open.c --- linux-4.2/fs/open.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/fs/open.c 2015-09-08 00:48:22.228363880 -0300 @@ -34,6 +34,9 @@ #include "internal.h" +#define CREATE_TRACE_POINTS +#include + int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { @@ -1029,6 +1032,7 @@ } else { fsnotify_open(f); fd_install(fd, f); + trace_do_sys_open(tmp->name, flags, mode); } } putname(tmp); diff -Nur linux-4.2/fs/proc/meminfo.c linux-4.2-pck/fs/proc/meminfo.c --- linux-4.2/fs/proc/meminfo.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/fs/proc/meminfo.c 2015-09-08 00:51:03.460668141 -0300 @@ -124,6 +124,9 @@ "SUnreclaim: %8lu kB\n" "KernelStack: %8lu kB\n" "PageTables: %8lu kB\n" +#ifdef CONFIG_UKSM + "KsmZeroPages: %8lu kB\n" +#endif #ifdef CONFIG_QUICKLIST "Quicklists: %8lu kB\n" #endif @@ -182,6 +185,9 @@ K(global_page_state(NR_SLAB_UNRECLAIMABLE)), global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_UKSM + K(global_page_state(NR_UKSM_ZERO_PAGES)), +#endif #ifdef CONFIG_QUICKLIST K(quicklist_total_size()), #endif diff -Nur linux-4.2/include/acpi/acconfig.h linux-4.2-pck/include/acpi/acconfig.h --- linux-4.2/include/acpi/acconfig.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/acpi/acconfig.h 2015-09-08 00:48:22.228363880 -0300 @@ -112,9 +112,19 @@ * *****************************************************************************/ -/* Version of ACPI supported */ - +/* + * Version of ACPI supported. This is a sad story. Windows reports a _REV of + * 2 regardless of the spec version implemented. 
Some vendors are using _REV + * as a way to distinguish between Windows and Linux, and are breaking systems + * in the process. We can't guarantee that they'll call _OSI before checking + * _REV, so hardcode this to 2 on x86 systems right now and leave it at the + * appropriate spec value for everybody else. + */ +#ifdef CONFIG_X86 +#define ACPI_CA_SUPPORT_LEVEL 2 +#else #define ACPI_CA_SUPPORT_LEVEL 5 +#endif /* Maximum count for a semaphore object */ diff -Nur linux-4.2/include/asm-generic/pgtable.h linux-4.2-pck/include/asm-generic/pgtable.h --- linux-4.2/include/asm-generic/pgtable.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/asm-generic/pgtable.h 2015-09-08 00:51:03.460668141 -0300 @@ -555,12 +555,25 @@ unsigned long size); #endif +#ifdef CONFIG_UKSM +static inline int is_uksm_zero_pfn(unsigned long pfn) +{ + extern unsigned long uksm_zero_pfn; + return pfn == uksm_zero_pfn; +} +#else +static inline int is_uksm_zero_pfn(unsigned long pfn) +{ + return 0; +} +#endif + #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; - return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) @@ -569,7 +582,7 @@ static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; - return pfn == zero_pfn; + return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); } static inline unsigned long my_zero_pfn(unsigned long addr) diff -Nur linux-4.2/include/linux/cgroup_subsys.h linux-4.2-pck/include/linux/cgroup_subsys.h --- linux-4.2/include/linux/cgroup_subsys.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/cgroup_subsys.h 2015-09-08 00:48:22.228363880 -0300 @@ -35,6 +35,10 @@ SUBSYS(net_cls) #endif +#if IS_ENABLED(CONFIG_CGROUP_BFQIO) +SUBSYS(bfqio) +#endif + #if IS_ENABLED(CONFIG_CGROUP_PERF) SUBSYS(perf_event) #endif diff -Nur linux-4.2/include/linux/console.h linux-4.2-pck/include/linux/console.h --- linux-4.2/include/linux/console.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/console.h 2015-09-08 00:48:22.228363880 -0300 @@ -119,7 +119,7 @@ struct console { char name[16]; - void (*write)(struct console *, const char *, unsigned); + void (*write)(struct console *, const char *, unsigned, unsigned int); int (*read)(struct console *, char *, unsigned); struct tty_driver *(*device)(struct console *, int *); void (*unblank)(void); diff -Nur linux-4.2/include/linux/ksm.h linux-4.2-pck/include/linux/ksm.h --- linux-4.2/include/linux/ksm.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/ksm.h 2015-09-08 00:51:03.460668141 -0300 @@ -19,21 +19,6 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); -int __ksm_enter(struct mm_struct *mm); -void __ksm_exit(struct mm_struct *mm); - -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -{ - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) - return __ksm_enter(mm); - return 0; -} - -static inline void ksm_exit(struct mm_struct *mm) -{ - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) - __ksm_exit(mm); -} static inline struct stable_node *page_stable_node(struct page *page) { @@ -64,6 +49,33 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, 
struct page *oldpage); +#ifdef CONFIG_KSM_LEGACY +int __ksm_enter(struct mm_struct *mm); +void __ksm_exit(struct mm_struct *mm); +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) + return __ksm_enter(mm); + return 0; +} + +static inline void ksm_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) + __ksm_exit(mm); +} + +#elif defined(CONFIG_UKSM) +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} + +static inline void ksm_exit(struct mm_struct *mm) +{ +} +#endif /* !CONFIG_UKSM */ + #else /* !CONFIG_KSM */ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) @@ -106,4 +118,6 @@ #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ +#include + #endif /* __LINUX_KSM_H */ diff -Nur linux-4.2/include/linux/libata.h linux-4.2-pck/include/linux/libata.h --- linux-4.2/include/linux/libata.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/libata.h 2015-09-08 00:48:22.228363880 -0300 @@ -516,6 +516,7 @@ enum ata_lpm_policy { ATA_LPM_UNKNOWN, ATA_LPM_MAX_POWER, + ATA_LPM_FIRMWARE_DEFAULTS, ATA_LPM_MED_POWER, ATA_LPM_MIN_POWER, }; @@ -728,6 +729,8 @@ int spdn_cnt; /* ering is CLEAR_END, read comment above CLEAR_END */ struct ata_ering ering; + /* Initial DIPM configuration */ + bool init_dipm; }; /* Fields between ATA_DEVICE_CLEAR_BEGIN and ATA_DEVICE_CLEAR_END are @@ -798,6 +801,7 @@ struct ata_device device[ATA_MAX_DEVICES]; + u8 init_lpm; /* initial lpm configuration */ unsigned long last_lpm_change; /* when last LPM change happened */ }; #define ATA_LINK_CLEAR_BEGIN offsetof(struct ata_link, active_tag) diff -Nur linux-4.2/include/linux/mm_types.h linux-4.2-pck/include/linux/mm_types.h --- linux-4.2/include/linux/mm_types.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/mm_types.h 2015-09-08 00:51:03.460668141 -0300 @@ -322,6 +322,9 @@ #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif +#ifdef CONFIG_UKSM + struct vma_slot *uksm_vma_slot; +#endif }; struct core_thread { diff -Nur linux-4.2/include/linux/mmzone.h linux-4.2-pck/include/linux/mmzone.h --- linux-4.2/include/linux/mmzone.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/mmzone.h 2015-09-08 00:51:03.460668141 -0300 @@ -157,6 +157,9 @@ WORKINGSET_NODERECLAIM, NR_ANON_TRANSPARENT_HUGEPAGES, NR_FREE_CMA_PAGES, +#ifdef CONFIG_UKSM + NR_UKSM_ZERO_PAGES, +#endif NR_VM_ZONE_STAT_ITEMS }; /* @@ -872,7 +875,7 @@ } /** - * is_highmem - helper function to quickly check if a struct zone is a + * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. 
* @zone - pointer to struct zone variable diff -Nur linux-4.2/include/linux/sradix-tree.h linux-4.2-pck/include/linux/sradix-tree.h --- linux-4.2/include/linux/sradix-tree.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/linux/sradix-tree.h 2015-09-08 00:51:03.460668141 -0300 @@ -0,0 +1,77 @@ +#ifndef _LINUX_SRADIX_TREE_H +#define _LINUX_SRADIX_TREE_H + + +#define INIT_SRADIX_TREE(root, mask) \ +do { \ + (root)->height = 0; \ + (root)->gfp_mask = (mask); \ + (root)->rnode = NULL; \ +} while (0) + +#define ULONG_BITS (sizeof(unsigned long) * 8) +#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +//#define SRADIX_TREE_MAP_SHIFT 6 +//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) +//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) + +struct sradix_tree_node { + unsigned int height; /* Height from the bottom */ + unsigned int count; + unsigned int fulls; /* Number of full sublevel trees */ + struct sradix_tree_node *parent; + void *stores[0]; +}; + +/* A simple radix tree implementation */ +struct sradix_tree_root { + unsigned int height; + struct sradix_tree_node *rnode; + + /* Where found to have available empty stores in its sublevels */ + struct sradix_tree_node *enter_node; + unsigned int shift; + unsigned int stores_size; + unsigned int mask; + unsigned long min; /* The first hole index */ + unsigned long num; + //unsigned long *height_to_maxindex; + + /* How the node is allocated and freed. */ + struct sradix_tree_node *(*alloc)(void); + void (*free)(struct sradix_tree_node *node); + + /* When a new node is added and removed */ + void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); + void (*assign)(struct sradix_tree_node *node, unsigned index, void *item); + void (*rm)(struct sradix_tree_node *node, unsigned offset); +}; + +struct sradix_tree_path { + struct sradix_tree_node *node; + int offset; +}; + +static inline +void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) +{ + root->height = 0; + root->rnode = NULL; + root->shift = shift; + root->stores_size = 1UL << shift; + root->mask = root->stores_size - 1; +} + + +extern void *sradix_tree_next(struct sradix_tree_root *root, + struct sradix_tree_node *node, unsigned long index, + int (*iter)(void *, unsigned long)); + +extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); + +extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, + struct sradix_tree_node *node, unsigned long index); + +extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); + +#endif /* _LINUX_SRADIX_TREE_H */ diff -Nur linux-4.2/include/linux/tcp.h linux-4.2-pck/include/linux/tcp.h --- linux-4.2/include/linux/tcp.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/tcp.h 2015-09-08 00:48:31.501257347 -0300 @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -328,6 +329,21 @@ struct tcp_md5sig_info __rcu *md5sig_info; #endif +#ifdef CONFIG_TCP_STEALTH +/* Stealth TCP socket configuration */ + struct { + #define TCP_STEALTH_MODE_AUTH BIT(0) + #define TCP_STEALTH_MODE_INTEGRITY BIT(1) + #define TCP_STEALTH_MODE_INTEGRITY_LEN BIT(2) + u8 mode; + u8 secret[MD5_MESSAGE_BYTES]; + u16 integrity_hash; + size_t integrity_len; + struct skb_mstamp mstamp; + bool saw_tsval; + } stealth; +#endif + /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big diff -Nur 
linux-4.2/include/linux/thinkpad_ec.h linux-4.2-pck/include/linux/thinkpad_ec.h --- linux-4.2/include/linux/thinkpad_ec.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/linux/thinkpad_ec.h 2015-09-08 00:48:22.228363880 -0300 @@ -0,0 +1,47 @@ +/* + * thinkpad_ec.h - interface to ThinkPad embedded controller LPC3 functions + * + * Copyright (C) 2005 Shem Multinymous + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _THINKPAD_EC_H +#define _THINKPAD_EC_H + +#ifdef __KERNEL__ + +#define TP_CONTROLLER_ROW_LEN 16 + +/* EC transactions input and output (possibly partial) vectors of 16 bytes. */ +struct thinkpad_ec_row { + u16 mask; /* bitmap of which entries of val[] are meaningful */ + u8 val[TP_CONTROLLER_ROW_LEN]; +}; + +extern int __must_check thinkpad_ec_lock(void); +extern int __must_check thinkpad_ec_try_lock(void); +extern void thinkpad_ec_unlock(void); + +extern int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, + struct thinkpad_ec_row *data); +extern int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, + struct thinkpad_ec_row *mask); +extern int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args); +extern void thinkpad_ec_invalidate(void); + + +#endif /* __KERNEL */ +#endif /* _THINKPAD_EC_H */ diff -Nur linux-4.2/include/linux/uksm.h linux-4.2-pck/include/linux/uksm.h --- linux-4.2/include/linux/uksm.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/linux/uksm.h 2015-09-08 00:51:03.460668141 -0300 @@ -0,0 +1,146 @@ +#ifndef __LINUX_UKSM_H +#define __LINUX_UKSM_H +/* + * Memory merging support. + * + * This code enables dynamic sharing of identical pages found in different + * memory areas, even if they are not shared by fork(). + */ + +/* if !CONFIG_UKSM this file should not be compiled at all. 
*/ +#ifdef CONFIG_UKSM + +#include +#include +#include +#include +#include + +extern unsigned long zero_pfn __read_mostly; +extern unsigned long uksm_zero_pfn __read_mostly; +extern struct page *empty_uksm_zero_page; + +/* must be done before linked to mm */ +extern void uksm_vma_add_new(struct vm_area_struct *vma); +extern void uksm_remove_vma(struct vm_area_struct *vma); + +#define UKSM_SLOT_NEED_SORT (1 << 0) +#define UKSM_SLOT_NEED_RERAND (1 << 1) +#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ +#define UKSM_SLOT_FUL_SCANNED (1 << 3) +#define UKSM_SLOT_IN_UKSM (1 << 4) + +struct vma_slot { + struct sradix_tree_node *snode; + unsigned long sindex; + + struct list_head slot_list; + unsigned long fully_scanned_round; + unsigned long dedup_num; + unsigned long pages_scanned; + unsigned long last_scanned; + unsigned long pages_to_scan; + struct scan_rung *rung; + struct page **rmap_list_pool; + unsigned int *pool_counts; + unsigned long pool_size; + struct vm_area_struct *vma; + struct mm_struct *mm; + unsigned long ctime_j; + unsigned long pages; + unsigned long flags; + unsigned long pages_cowed; /* pages cowed this round */ + unsigned long pages_merged; /* pages merged this round */ + unsigned long pages_bemerged; + + /* when it has page merged in this eval round */ + struct list_head dedup_list; +}; + +static inline void uksm_unmap_zero_page(pte_t pte) +{ + if (pte_pfn(pte) == uksm_zero_pfn) + __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); +} + +static inline void uksm_map_zero_page(pte_t pte) +{ + if (pte_pfn(pte) == uksm_zero_pfn) + __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); +} + +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) +{ + if (vma->uksm_vma_slot && PageKsm(page)) + vma->uksm_vma_slot->pages_cowed++; +} + +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) +{ + if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) + vma->uksm_vma_slot->pages_cowed++; +} + +static inline int uksm_flags_can_scan(unsigned long vm_flags) +{ +#ifndef VM_SAO +#define VM_SAO 0 +#endif + return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_HUGETLB | VM_MIXEDMAP | VM_SHARED + | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO)); +} + +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) +{ + if (uksm_flags_can_scan(*vm_flags_p)) + *vm_flags_p |= VM_MERGEABLE; +} + +/* + * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will + * be removed when uksm zero page patch is stable enough. 
+ */ +static inline void uksm_bugon_zeropage(pte_t pte) +{ + BUG_ON(pte_pfn(pte) == uksm_zero_pfn); +} +#else +static inline void uksm_vma_add_new(struct vm_area_struct *vma) +{ +} + +static inline void uksm_remove_vma(struct vm_area_struct *vma) +{ +} + +static inline void uksm_unmap_zero_page(pte_t pte) +{ +} + +static inline void uksm_map_zero_page(pte_t pte) +{ +} + +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) +{ +} + +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) +{ +} + +static inline int uksm_flags_can_scan(unsigned long vm_flags) +{ + return 0; +} + +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) +{ +} + +static inline void uksm_bugon_zeropage(pte_t pte) +{ +} +#endif /* !CONFIG_UKSM */ +#endif /* __LINUX_UKSM_H */ diff -Nur linux-4.2/include/net/netfilter/nf_conntrack.h linux-4.2-pck/include/net/netfilter/nf_conntrack.h --- linux-4.2/include/net/netfilter/nf_conntrack.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/net/netfilter/nf_conntrack.h 2015-09-08 00:48:22.228363880 -0300 @@ -292,6 +292,7 @@ void init_nf_conntrack_hash_rnd(void); struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags); +void nf_ct_tmpl_free(struct nf_conn *tmpl); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) diff -Nur linux-4.2/include/net/secure_seq.h linux-4.2-pck/include/net/secure_seq.h --- linux-4.2/include/net/secure_seq.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/net/secure_seq.h 2015-09-08 00:48:31.501257347 -0300 @@ -14,5 +14,10 @@ __be16 sport, __be16 dport); u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, __be16 sport, __be16 dport); +#ifdef CONFIG_TCP_STEALTH +u32 tcp_stealth_do_auth(struct sock *sk, struct sk_buff *skb); +u32 tcp_stealth_sequence_number(struct sock *sk, __be32 *daddr, + u32 daddr_size, __be16 dport); +#endif #endif /* _NET_SECURE_SEQ */ diff -Nur linux-4.2/include/net/tcp.h linux-4.2-pck/include/net/tcp.h --- linux-4.2/include/net/tcp.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/net/tcp.h 2015-09-08 00:48:31.501257347 -0300 @@ -440,6 +440,12 @@ struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); +#ifdef CONFIG_TCP_STEALTH +const bool tcp_parse_tsval_option(u32 *tsval, const struct tcphdr *th); +int tcp_stealth_integrity(u16 *hash, u8 *secret, u8 *payload, int len); +#define be32_isn_to_be16_av(x) (((__be16 *)&x)[0]) +#define be32_isn_to_be16_ih(x) (((__be16 *)&x)[1]) +#endif /* * TCP v4 functions exported for the inet6 API diff -Nur linux-4.2/include/trace/events/fs.h linux-4.2-pck/include/trace/events/fs.h --- linux-4.2/include/trace/events/fs.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/trace/events/fs.h 2015-09-08 00:48:22.228363880 -0300 @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs + +#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_H + +#include +#include + +TRACE_EVENT(do_sys_open, + + TP_PROTO(const char *filename, int flags, int mode), + + TP_ARGS(filename, flags, mode), + + TP_STRUCT__entry( + __string( filename, filename ) + __field( int, flags ) + __field( int, mode ) + ), + + TP_fast_assign( + __assign_str(filename, filename); + __entry->flags = flags; + __entry->mode = mode; + ), + + TP_printk("\"%s\" %x %o", + __get_str(filename), 
__entry->flags, __entry->mode) +); + +TRACE_EVENT(open_exec, + + TP_PROTO(const char *filename), + + TP_ARGS(filename), + + TP_STRUCT__entry( + __string( filename, filename ) + ), + + TP_fast_assign( + __assign_str(filename, filename); + ), + + TP_printk("\"%s\"", + __get_str(filename)) +); + +#endif /* _TRACE_FS_H */ + +/* This part must be outside protection */ +#include diff -Nur linux-4.2/include/uapi/linux/Kbuild linux-4.2-pck/include/uapi/linux/Kbuild --- linux-4.2/include/uapi/linux/Kbuild 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/uapi/linux/Kbuild 2015-09-08 00:48:22.228363880 -0300 @@ -216,6 +216,7 @@ header-y += jffs2.h header-y += joystick.h header-y += kcmp.h +header-y += kdbus.h header-y += kdev_t.h header-y += kd.h header-y += kernelcapi.h diff -Nur linux-4.2/include/uapi/linux/kdbus.h linux-4.2-pck/include/uapi/linux/kdbus.h --- linux-4.2/include/uapi/linux/kdbus.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/uapi/linux/kdbus.h 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,984 @@ +/* + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef _UAPI_KDBUS_H_ +#define _UAPI_KDBUS_H_ + +#include +#include + +#define KDBUS_IOCTL_MAGIC 0x95 +#define KDBUS_SRC_ID_KERNEL (0) +#define KDBUS_DST_ID_NAME (0) +#define KDBUS_MATCH_ID_ANY (~0ULL) +#define KDBUS_DST_ID_BROADCAST (~0ULL) +#define KDBUS_FLAG_NEGOTIATE (1ULL << 63) + +/** + * struct kdbus_notify_id_change - name registry change message + * @id: New or former owner of the name + * @flags: flags field from KDBUS_HELLO_* + * + * Sent from kernel to userspace when the owner or activator of + * a well-known name changes. + * + * Attached to: + * KDBUS_ITEM_ID_ADD + * KDBUS_ITEM_ID_REMOVE + */ +struct kdbus_notify_id_change { + __u64 id; + __u64 flags; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_notify_name_change - name registry change message + * @old_id: ID and flags of former owner of a name + * @new_id: ID and flags of new owner of a name + * @name: Well-known name + * + * Sent from kernel to userspace when the owner or activator of + * a well-known name changes. + * + * Attached to: + * KDBUS_ITEM_NAME_ADD + * KDBUS_ITEM_NAME_REMOVE + * KDBUS_ITEM_NAME_CHANGE + */ +struct kdbus_notify_name_change { + struct kdbus_notify_id_change old_id; + struct kdbus_notify_id_change new_id; + char name[0]; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_creds - process credentials + * @uid: User ID + * @euid: Effective UID + * @suid: Saved UID + * @fsuid: Filesystem UID + * @gid: Group ID + * @egid: Effective GID + * @sgid: Saved GID + * @fsgid: Filesystem GID + * + * Attached to: + * KDBUS_ITEM_CREDS + */ +struct kdbus_creds { + __u64 uid; + __u64 euid; + __u64 suid; + __u64 fsuid; + __u64 gid; + __u64 egid; + __u64 sgid; + __u64 fsgid; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_pids - process identifiers + * @pid: Process ID + * @tid: Thread ID + * @ppid: Parent process ID + * + * The PID and TID of a process. 
+ * + * Attached to: + * KDBUS_ITEM_PIDS + */ +struct kdbus_pids { + __u64 pid; + __u64 tid; + __u64 ppid; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_caps - process capabilities + * @last_cap: Highest currently known capability bit + * @caps: Variable number of 32-bit capabilities flags + * + * Contains a variable number of 32-bit capabilities flags. + * + * Attached to: + * KDBUS_ITEM_CAPS + */ +struct kdbus_caps { + __u32 last_cap; + __u32 caps[0]; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_audit - audit information + * @sessionid: The audit session ID + * @loginuid: The audit login uid + * + * Attached to: + * KDBUS_ITEM_AUDIT + */ +struct kdbus_audit { + __u32 sessionid; + __u32 loginuid; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_timestamp + * @seqnum: Global per-domain message sequence number + * @monotonic_ns: Monotonic timestamp, in nanoseconds + * @realtime_ns: Realtime timestamp, in nanoseconds + * + * Attached to: + * KDBUS_ITEM_TIMESTAMP + */ +struct kdbus_timestamp { + __u64 seqnum; + __u64 monotonic_ns; + __u64 realtime_ns; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_vec - I/O vector for kdbus payload items + * @size: The size of the vector + * @address: Memory address of data buffer + * @offset: Offset in the in-message payload memory, + * relative to the message head + * + * Attached to: + * KDBUS_ITEM_PAYLOAD_VEC, KDBUS_ITEM_PAYLOAD_OFF + */ +struct kdbus_vec { + __u64 size; + union { + __u64 address; + __u64 offset; + }; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_bloom_parameter - bus-wide bloom parameters + * @size: Size of the bit field in bytes (m / 8) + * @n_hash: Number of hash functions used (k) + */ +struct kdbus_bloom_parameter { + __u64 size; + __u64 n_hash; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_bloom_filter - bloom filter containing n elements + * @generation: Generation of the element set in the filter + * @data: Bit field, multiple of 8 bytes + */ +struct kdbus_bloom_filter { + __u64 generation; + __u64 data[0]; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_memfd - a kdbus memfd + * @start: The offset into the memfd where the segment starts + * @size: The size of the memfd segment + * @fd: The file descriptor number + * @__pad: Padding to ensure proper alignment and size + * + * Attached to: + * KDBUS_ITEM_PAYLOAD_MEMFD + */ +struct kdbus_memfd { + __u64 start; + __u64 size; + int fd; + __u32 __pad; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_name - a registered well-known name with its flags + * @flags: Flags from KDBUS_NAME_* + * @name: Well-known name + * + * Attached to: + * KDBUS_ITEM_OWNED_NAME + */ +struct kdbus_name { + __u64 flags; + char name[0]; +} __attribute__((__aligned__(8))); + +/** + * enum kdbus_policy_access_type - permissions of a policy record + * @_KDBUS_POLICY_ACCESS_NULL: Uninitialized/invalid + * @KDBUS_POLICY_ACCESS_USER: Grant access to a uid + * @KDBUS_POLICY_ACCESS_GROUP: Grant access to gid + * @KDBUS_POLICY_ACCESS_WORLD: World-accessible + */ +enum kdbus_policy_access_type { + _KDBUS_POLICY_ACCESS_NULL, + KDBUS_POLICY_ACCESS_USER, + KDBUS_POLICY_ACCESS_GROUP, + KDBUS_POLICY_ACCESS_WORLD, +}; + +/** + * enum kdbus_policy_access_flags - mode flags + * @KDBUS_POLICY_OWN: Allow to own a well-known name + * Implies KDBUS_POLICY_TALK and KDBUS_POLICY_SEE + * @KDBUS_POLICY_TALK: Allow communication to a well-known name + * Implies KDBUS_POLICY_SEE + * @KDBUS_POLICY_SEE: Allow to see a well-known name + 
*/ +enum kdbus_policy_type { + KDBUS_POLICY_SEE = 0, + KDBUS_POLICY_TALK, + KDBUS_POLICY_OWN, +}; + +/** + * struct kdbus_policy_access - policy access item + * @type: One of KDBUS_POLICY_ACCESS_* types + * @access: Access to grant + * @id: For KDBUS_POLICY_ACCESS_USER, the uid + * For KDBUS_POLICY_ACCESS_GROUP, the gid + */ +struct kdbus_policy_access { + __u64 type; /* USER, GROUP, WORLD */ + __u64 access; /* OWN, TALK, SEE */ + __u64 id; /* uid, gid, 0 */ +} __attribute__((__aligned__(8))); + +/** + * enum kdbus_attach_flags - flags for metadata attachments + * @KDBUS_ATTACH_TIMESTAMP: Timestamp + * @KDBUS_ATTACH_CREDS: Credentials + * @KDBUS_ATTACH_PIDS: PIDs + * @KDBUS_ATTACH_AUXGROUPS: Auxiliary groups + * @KDBUS_ATTACH_NAMES: Well-known names + * @KDBUS_ATTACH_TID_COMM: The "comm" process identifier of the TID + * @KDBUS_ATTACH_PID_COMM: The "comm" process identifier of the PID + * @KDBUS_ATTACH_EXE: The path of the executable + * @KDBUS_ATTACH_CMDLINE: The process command line + * @KDBUS_ATTACH_CGROUP: The croup membership + * @KDBUS_ATTACH_CAPS: The process capabilities + * @KDBUS_ATTACH_SECLABEL: The security label + * @KDBUS_ATTACH_AUDIT: The audit IDs + * @KDBUS_ATTACH_CONN_DESCRIPTION: The human-readable connection name + * @_KDBUS_ATTACH_ALL: All of the above + * @_KDBUS_ATTACH_ANY: Wildcard match to enable any kind of + * metatdata. + */ +enum kdbus_attach_flags { + KDBUS_ATTACH_TIMESTAMP = 1ULL << 0, + KDBUS_ATTACH_CREDS = 1ULL << 1, + KDBUS_ATTACH_PIDS = 1ULL << 2, + KDBUS_ATTACH_AUXGROUPS = 1ULL << 3, + KDBUS_ATTACH_NAMES = 1ULL << 4, + KDBUS_ATTACH_TID_COMM = 1ULL << 5, + KDBUS_ATTACH_PID_COMM = 1ULL << 6, + KDBUS_ATTACH_EXE = 1ULL << 7, + KDBUS_ATTACH_CMDLINE = 1ULL << 8, + KDBUS_ATTACH_CGROUP = 1ULL << 9, + KDBUS_ATTACH_CAPS = 1ULL << 10, + KDBUS_ATTACH_SECLABEL = 1ULL << 11, + KDBUS_ATTACH_AUDIT = 1ULL << 12, + KDBUS_ATTACH_CONN_DESCRIPTION = 1ULL << 13, + _KDBUS_ATTACH_ALL = (1ULL << 14) - 1, + _KDBUS_ATTACH_ANY = ~0ULL +}; + +/** + * enum kdbus_item_type - item types to chain data in a list + * @_KDBUS_ITEM_NULL: Uninitialized/invalid + * @_KDBUS_ITEM_USER_BASE: Start of user items + * @KDBUS_ITEM_NEGOTIATE: Negotiate supported items + * @KDBUS_ITEM_PAYLOAD_VEC: Vector to data + * @KDBUS_ITEM_PAYLOAD_OFF: Data at returned offset to message head + * @KDBUS_ITEM_PAYLOAD_MEMFD: Data as sealed memfd + * @KDBUS_ITEM_FDS: Attached file descriptors + * @KDBUS_ITEM_CANCEL_FD: FD used to cancel a synchronous + * operation by writing to it from + * userspace + * @KDBUS_ITEM_BLOOM_PARAMETER: Bus-wide bloom parameters, used with + * KDBUS_CMD_BUS_MAKE, carries a + * struct kdbus_bloom_parameter + * @KDBUS_ITEM_BLOOM_FILTER: Bloom filter carried with a message, + * used to match against a bloom mask of a + * connection, carries a struct + * kdbus_bloom_filter + * @KDBUS_ITEM_BLOOM_MASK: Bloom mask used to match against a + * message'sbloom filter + * @KDBUS_ITEM_DST_NAME: Destination's well-known name + * @KDBUS_ITEM_MAKE_NAME: Name of domain, bus, endpoint + * @KDBUS_ITEM_ATTACH_FLAGS_SEND: Attach-flags, used for updating which + * metadata a connection opts in to send + * @KDBUS_ITEM_ATTACH_FLAGS_RECV: Attach-flags, used for updating which + * metadata a connection requests to + * receive for each reeceived message + * @KDBUS_ITEM_ID: Connection ID + * @KDBUS_ITEM_NAME: Well-know name with flags + * @_KDBUS_ITEM_ATTACH_BASE: Start of metadata attach items + * @KDBUS_ITEM_TIMESTAMP: Timestamp + * @KDBUS_ITEM_CREDS: Process credentials + * @KDBUS_ITEM_PIDS: Process identifiers + 
+ * @KDBUS_ITEM_AUXGROUPS: Auxiliary process groups
+ * @KDBUS_ITEM_OWNED_NAME: A name owned by the associated
+ * connection
+ * @KDBUS_ITEM_TID_COMM: Thread ID "comm" identifier
+ * (Don't trust this, see below.)
+ * @KDBUS_ITEM_PID_COMM: Process ID "comm" identifier
+ * (Don't trust this, see below.)
+ * @KDBUS_ITEM_EXE: The path of the executable
+ * (Don't trust this, see below.)
+ * @KDBUS_ITEM_CMDLINE: The process command line
+ * (Don't trust this, see below.)
+ * @KDBUS_ITEM_CGROUP: The cgroup membership
+ * @KDBUS_ITEM_CAPS: The process capabilities
+ * @KDBUS_ITEM_SECLABEL: The security label
+ * @KDBUS_ITEM_AUDIT: The audit IDs
+ * @KDBUS_ITEM_CONN_DESCRIPTION: The connection's human-readable name
+ * (debugging)
+ * @_KDBUS_ITEM_POLICY_BASE: Start of policy items
+ * @KDBUS_ITEM_POLICY_ACCESS: Policy access block
+ * @_KDBUS_ITEM_KERNEL_BASE: Start of kernel-generated message items
+ * @KDBUS_ITEM_NAME_ADD: Notification in kdbus_notify_name_change
+ * @KDBUS_ITEM_NAME_REMOVE: Notification in kdbus_notify_name_change
+ * @KDBUS_ITEM_NAME_CHANGE: Notification in kdbus_notify_name_change
+ * @KDBUS_ITEM_ID_ADD: Notification in kdbus_notify_id_change
+ * @KDBUS_ITEM_ID_REMOVE: Notification in kdbus_notify_id_change
+ * @KDBUS_ITEM_REPLY_TIMEOUT: Timeout has been reached
+ * @KDBUS_ITEM_REPLY_DEAD: Destination died
+ *
+ * N.B.: The process and thread COMM fields, as well as the CMDLINE and
+ * EXE fields, may be altered by unprivileged processes and should
+ * hence *not* be used for security decisions. Peers should make use of
+ * these items only for informational purposes, such as generating log
+ * records.
+ */
+enum kdbus_item_type {
+ _KDBUS_ITEM_NULL,
+ _KDBUS_ITEM_USER_BASE,
+ KDBUS_ITEM_NEGOTIATE = _KDBUS_ITEM_USER_BASE,
+ KDBUS_ITEM_PAYLOAD_VEC,
+ KDBUS_ITEM_PAYLOAD_OFF,
+ KDBUS_ITEM_PAYLOAD_MEMFD,
+ KDBUS_ITEM_FDS,
+ KDBUS_ITEM_CANCEL_FD,
+ KDBUS_ITEM_BLOOM_PARAMETER,
+ KDBUS_ITEM_BLOOM_FILTER,
+ KDBUS_ITEM_BLOOM_MASK,
+ KDBUS_ITEM_DST_NAME,
+ KDBUS_ITEM_MAKE_NAME,
+ KDBUS_ITEM_ATTACH_FLAGS_SEND,
+ KDBUS_ITEM_ATTACH_FLAGS_RECV,
+ KDBUS_ITEM_ID,
+ KDBUS_ITEM_NAME,
+ KDBUS_ITEM_DST_ID,
+
+ /* keep these item types in sync with KDBUS_ATTACH_* flags */
+ _KDBUS_ITEM_ATTACH_BASE = 0x1000,
+ KDBUS_ITEM_TIMESTAMP = _KDBUS_ITEM_ATTACH_BASE,
+ KDBUS_ITEM_CREDS,
+ KDBUS_ITEM_PIDS,
+ KDBUS_ITEM_AUXGROUPS,
+ KDBUS_ITEM_OWNED_NAME,
+ KDBUS_ITEM_TID_COMM,
+ KDBUS_ITEM_PID_COMM,
+ KDBUS_ITEM_EXE,
+ KDBUS_ITEM_CMDLINE,
+ KDBUS_ITEM_CGROUP,
+ KDBUS_ITEM_CAPS,
+ KDBUS_ITEM_SECLABEL,
+ KDBUS_ITEM_AUDIT,
+ KDBUS_ITEM_CONN_DESCRIPTION,
+
+ _KDBUS_ITEM_POLICY_BASE = 0x2000,
+ KDBUS_ITEM_POLICY_ACCESS = _KDBUS_ITEM_POLICY_BASE,
+
+ _KDBUS_ITEM_KERNEL_BASE = 0x8000,
+ KDBUS_ITEM_NAME_ADD = _KDBUS_ITEM_KERNEL_BASE,
+ KDBUS_ITEM_NAME_REMOVE,
+ KDBUS_ITEM_NAME_CHANGE,
+ KDBUS_ITEM_ID_ADD,
+ KDBUS_ITEM_ID_REMOVE,
+ KDBUS_ITEM_REPLY_TIMEOUT,
+ KDBUS_ITEM_REPLY_DEAD,
+};
+
+/**
+ * struct kdbus_item - chain of data blocks
+ * @size: Overall data record size
+ * @type: Kdbus_item type of data
+ * @data: Generic bytes
+ * @data32: Generic 32 bit array
+ * @data64: Generic 64 bit array
+ * @str: Generic string
+ * @id: Connection ID
+ * @vec: KDBUS_ITEM_PAYLOAD_VEC
+ * @creds: KDBUS_ITEM_CREDS
+ * @audit: KDBUS_ITEM_AUDIT
+ * @timestamp: KDBUS_ITEM_TIMESTAMP
+ * @name: KDBUS_ITEM_NAME
+ * @bloom_parameter: KDBUS_ITEM_BLOOM_PARAMETER
+ * @bloom_filter: KDBUS_ITEM_BLOOM_FILTER
+ * @memfd: KDBUS_ITEM_PAYLOAD_MEMFD
+ * @name_change: KDBUS_ITEM_NAME_ADD
+ * KDBUS_ITEM_NAME_REMOVE
+ * KDBUS_ITEM_NAME_CHANGE
+ *
@id_change: KDBUS_ITEM_ID_ADD + * KDBUS_ITEM_ID_REMOVE + * @policy: KDBUS_ITEM_POLICY_ACCESS + */ +struct kdbus_item { + __u64 size; + __u64 type; + union { + __u8 data[0]; + __u32 data32[0]; + __u64 data64[0]; + char str[0]; + + __u64 id; + struct kdbus_vec vec; + struct kdbus_creds creds; + struct kdbus_pids pids; + struct kdbus_audit audit; + struct kdbus_caps caps; + struct kdbus_timestamp timestamp; + struct kdbus_name name; + struct kdbus_bloom_parameter bloom_parameter; + struct kdbus_bloom_filter bloom_filter; + struct kdbus_memfd memfd; + int fds[0]; + struct kdbus_notify_name_change name_change; + struct kdbus_notify_id_change id_change; + struct kdbus_policy_access policy_access; + }; +} __attribute__((__aligned__(8))); + +/** + * enum kdbus_msg_flags - type of message + * @KDBUS_MSG_EXPECT_REPLY: Expect a reply message, used for + * method calls. The userspace-supplied + * cookie identifies the message and the + * respective reply carries the cookie + * in cookie_reply + * @KDBUS_MSG_NO_AUTO_START: Do not start a service if the addressed + * name is not currently active. This flag is + * not looked at by the kernel but only + * serves as hint for userspace implementations. + * @KDBUS_MSG_SIGNAL: Treat this message as signal + */ +enum kdbus_msg_flags { + KDBUS_MSG_EXPECT_REPLY = 1ULL << 0, + KDBUS_MSG_NO_AUTO_START = 1ULL << 1, + KDBUS_MSG_SIGNAL = 1ULL << 2, +}; + +/** + * enum kdbus_payload_type - type of payload carried by message + * @KDBUS_PAYLOAD_KERNEL: Kernel-generated simple message + * @KDBUS_PAYLOAD_DBUS: D-Bus marshalling "DBusDBus" + * + * Any payload-type is accepted. Common types will get added here once + * established. + */ +enum kdbus_payload_type { + KDBUS_PAYLOAD_KERNEL, + KDBUS_PAYLOAD_DBUS = 0x4442757344427573ULL, +}; + +/** + * struct kdbus_msg - the representation of a kdbus message + * @size: Total size of the message + * @flags: Message flags (KDBUS_MSG_*), userspace → kernel + * @priority: Message queue priority value + * @dst_id: 64-bit ID of the destination connection + * @src_id: 64-bit ID of the source connection + * @payload_type: Payload type (KDBUS_PAYLOAD_*) + * @cookie: Userspace-supplied cookie, for the connection + * to identify its messages + * @timeout_ns: The time to wait for a message reply from the peer. + * If there is no reply, and the send command is + * executed asynchronously, a kernel-generated message + * with an attached KDBUS_ITEM_REPLY_TIMEOUT item + * is sent to @src_id. For synchronously executed send + * command, the value denotes the maximum time the call + * blocks to wait for a reply. The timeout is expected in + * nanoseconds and as absolute CLOCK_MONOTONIC value. + * @cookie_reply: A reply to the requesting message with the same + * cookie. 
The requesting connection can match its
+ * request and the reply with this value
+ * @items: A list of kdbus_items containing the message payload
+ */
+struct kdbus_msg {
+ __u64 size;
+ __u64 flags;
+ __s64 priority;
+ __u64 dst_id;
+ __u64 src_id;
+ __u64 payload_type;
+ __u64 cookie;
+ union {
+ __u64 timeout_ns;
+ __u64 cookie_reply;
+ };
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * struct kdbus_msg_info - returned message container
+ * @offset: Offset of kdbus_msg slice in pool
+ * @msg_size: Copy of the kdbus_msg.size field
+ * @return_flags: Command return flags, kernel → userspace
+ */
+struct kdbus_msg_info {
+ __u64 offset;
+ __u64 msg_size;
+ __u64 return_flags;
+} __attribute__((__aligned__(8)));
+
+/**
+ * enum kdbus_send_flags - flags for sending messages
+ * @KDBUS_SEND_SYNC_REPLY: Wait for destination connection to
+ * reply to this message. The
+ * KDBUS_CMD_SEND ioctl() will block
+ * until the reply is received, and
+ * reply in struct kdbus_cmd_send will
+ * yield the offset in the sender's pool
+ * where the reply can be found.
+ * This flag is only valid if
+ * @KDBUS_MSG_EXPECT_REPLY is set as well.
+ */
+enum kdbus_send_flags {
+ KDBUS_SEND_SYNC_REPLY = 1ULL << 0,
+};
+
+/**
+ * struct kdbus_cmd_send - send message
+ * @size: Overall size of this structure
+ * @flags: Flags to change send behavior (KDBUS_SEND_*)
+ * @return_flags: Command return flags, kernel → userspace
+ * @msg_address: Storage address of the kdbus_msg to send
+ * @reply: Storage for message reply if KDBUS_SEND_SYNC_REPLY
+ * was given
+ * @items: Additional items for this command
+ */
+struct kdbus_cmd_send {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ __u64 msg_address;
+ struct kdbus_msg_info reply;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * enum kdbus_recv_flags - flags for de-queuing messages
+ * @KDBUS_RECV_PEEK: Return the next queued message without
+ * actually de-queuing it, and without installing
+ * any file descriptors or other resources. It is
+ * usually used to determine the activating
+ * connection of a bus name.
+ * @KDBUS_RECV_DROP: Drop and free the next queued message and all
+ * its resources without actually receiving it.
+ * @KDBUS_RECV_USE_PRIORITY: Only de-queue messages with the specified or
+ * higher priority (lowest values); if not set,
+ * the priority value is ignored.
+ */
+enum kdbus_recv_flags {
+ KDBUS_RECV_PEEK = 1ULL << 0,
+ KDBUS_RECV_DROP = 1ULL << 1,
+ KDBUS_RECV_USE_PRIORITY = 1ULL << 2,
+};
+
+/**
+ * enum kdbus_recv_return_flags - return flags for message receive commands
+ * @KDBUS_RECV_RETURN_INCOMPLETE_FDS: One or more file descriptors could not
+ * be installed. These descriptors in
+ * KDBUS_ITEM_FDS will carry the value -1.
+ * @KDBUS_RECV_RETURN_DROPPED_MSGS: There have been dropped messages since
+ * the last time a message was received.
+ * The 'dropped_msgs' counter contains the
+ * number of messages dropped due to pool
+ * overflows or other missed broadcasts.
+ */
+enum kdbus_recv_return_flags {
+ KDBUS_RECV_RETURN_INCOMPLETE_FDS = 1ULL << 0,
+ KDBUS_RECV_RETURN_DROPPED_MSGS = 1ULL << 1,
+};
+
+/**
+ * struct kdbus_cmd_recv - struct to de-queue a buffered message
+ * @size: Overall size of this object
+ * @flags: KDBUS_RECV_* flags, userspace → kernel
+ * @return_flags: Command return flags, kernel → userspace
+ * @priority: Minimum priority of the messages to de-queue. Lowest
+ * values have the highest priority.
+ * @dropped_msgs: In case there were any dropped messages since the last
+ * time a message was received, this will be set to the
+ * number of lost messages and
+ * KDBUS_RECV_RETURN_DROPPED_MSGS will be set in
+ * 'return_flags'. This can only happen if the ioctl
+ * returns 0 or EAGAIN.
+ * @msg: Return storage for received message.
+ * @items: Additional items for this command.
+ *
+ * This struct is used with the KDBUS_CMD_RECV ioctl.
+ */
+struct kdbus_cmd_recv {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ __s64 priority;
+ __u64 dropped_msgs;
+ struct kdbus_msg_info msg;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * struct kdbus_cmd_free - struct to free a slice of memory in the pool
+ * @size: Overall size of this structure
+ * @flags: Flags for the free command, userspace → kernel
+ * @return_flags: Command return flags, kernel → userspace
+ * @offset: The offset of the memory slice, as returned by other
+ * ioctls
+ * @items: Additional items to modify the behavior
+ *
+ * This struct is used with the KDBUS_CMD_FREE ioctl.
+ */
+struct kdbus_cmd_free {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ __u64 offset;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * enum kdbus_hello_flags - flags for struct kdbus_cmd_hello
+ * @KDBUS_HELLO_ACCEPT_FD: The connection allows the reception of
+ * any passed file descriptors
+ * @KDBUS_HELLO_ACTIVATOR: Special-purpose connection which registers
+ * a well-known name for a process to be started
+ * when traffic arrives
+ * @KDBUS_HELLO_POLICY_HOLDER: Special-purpose connection which registers
+ * policy entries for a name. The provided name
+ * is not activated and not registered with the
+ * name database, it only allows unprivileged
+ * connections to acquire a name, talk or discover
+ * a service
+ * @KDBUS_HELLO_MONITOR: Special-purpose connection to monitor
+ * bus traffic
+ */
+enum kdbus_hello_flags {
+ KDBUS_HELLO_ACCEPT_FD = 1ULL << 0,
+ KDBUS_HELLO_ACTIVATOR = 1ULL << 1,
+ KDBUS_HELLO_POLICY_HOLDER = 1ULL << 2,
+ KDBUS_HELLO_MONITOR = 1ULL << 3,
+};
+
+/**
+ * struct kdbus_cmd_hello - struct to say hello to kdbus
+ * @size: The total size of the structure
+ * @flags: Connection flags (KDBUS_HELLO_*), userspace → kernel
+ * @return_flags: Command return flags, kernel → userspace
+ * @attach_flags_send: Mask of metadata to attach to each message sent
+ * off by this connection (KDBUS_ATTACH_*)
+ * @attach_flags_recv: Mask of metadata to attach to each message received
+ * by the new connection (KDBUS_ATTACH_*)
+ * @bus_flags: The flags field copied verbatim from the original
+ * KDBUS_CMD_BUS_MAKE ioctl. It's intended to be useful
+ * to do negotiation of features of the payload that is
+ * transferred (kernel → userspace)
+ * @id: The ID of this connection (kernel → userspace)
+ * @pool_size: Size of the connection's buffer where the received
+ * messages are placed
+ * @offset: Pool offset where items are returned to report
+ * additional information about the bus and the newly
+ * created connection.
+ * @items_size: Size of buffer returned in the pool slice at @offset.
+ * @id128: Unique 128-bit ID of the bus (kernel → userspace)
+ * @items: A list of items
+ *
+ * This struct is used with the KDBUS_CMD_HELLO ioctl.
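+ *
+ * A minimal userspace sketch (illustrative only; the endpoint path is an
+ * example, error handling is omitted, and @pool_size must be a non-zero
+ * multiple of PAGE_SIZE):
+ *
+ *	struct kdbus_cmd_hello hello = {
+ *		.size = sizeof(hello),
+ *		.pool_size = 1024 * 1024,
+ *	};
+ *	int fd = open("/sys/fs/kdbus/0-mybus/bus", O_RDWR | O_CLOEXEC);
+ *
+ *	if (ioctl(fd, KDBUS_CMD_HELLO, &hello) == 0)
+ *		printf("connected with unique id %llu\n",
+ *		       (unsigned long long)hello.id);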
+ */
+struct kdbus_cmd_hello {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ __u64 attach_flags_send;
+ __u64 attach_flags_recv;
+ __u64 bus_flags;
+ __u64 id;
+ __u64 pool_size;
+ __u64 offset;
+ __u64 items_size;
+ __u8 id128[16];
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * struct kdbus_info - connection information
+ * @size: total size of the struct
+ * @id: 64bit object ID
+ * @flags: object creation flags
+ * @items: list of items
+ *
+ * Note that the user is responsible for freeing the allocated memory with
+ * the KDBUS_CMD_FREE ioctl.
+ */
+struct kdbus_info {
+ __u64 size;
+ __u64 id;
+ __u64 flags;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * enum kdbus_list_flags - what to include into the returned list
+ * @KDBUS_LIST_UNIQUE: active connections
+ * @KDBUS_LIST_ACTIVATORS: activator connections
+ * @KDBUS_LIST_NAMES: known well-known names
+ * @KDBUS_LIST_QUEUED: queued-up names
+ */
+enum kdbus_list_flags {
+ KDBUS_LIST_UNIQUE = 1ULL << 0,
+ KDBUS_LIST_NAMES = 1ULL << 1,
+ KDBUS_LIST_ACTIVATORS = 1ULL << 2,
+ KDBUS_LIST_QUEUED = 1ULL << 3,
+};
+
+/**
+ * struct kdbus_cmd_list - list connections
+ * @size: overall size of this object
+ * @flags: flags for the query (KDBUS_LIST_*), userspace → kernel
+ * @return_flags: command return flags, kernel → userspace
+ * @offset: Offset in the caller's pool buffer where an array of
+ * kdbus_info objects is stored.
+ * The user must use KDBUS_CMD_FREE to free the
+ * allocated memory.
+ * @list_size: size of returned list in bytes
+ * @items: Items for the command. Reserved for future use.
+ *
+ * This structure is used with the KDBUS_CMD_LIST ioctl.
+ */
+struct kdbus_cmd_list {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ __u64 offset;
+ __u64 list_size;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * struct kdbus_cmd_info - struct used for KDBUS_CMD_CONN_INFO ioctl
+ * @size: The total size of the struct
+ * @flags: Flags for this ioctl, userspace → kernel
+ * @return_flags: Command return flags, kernel → userspace
+ * @id: The 64-bit ID of the connection. If set to zero, passing
+ * @name is required. kdbus will look up the name to
+ * determine the ID in this case.
+ * @attach_flags: Set of attach flags to specify the set of information
+ * to receive, userspace → kernel
+ * @offset: Returned offset in the caller's pool buffer where the
+ * kdbus_info struct result is stored. The user must
+ * use KDBUS_CMD_FREE to free the allocated memory.
+ * @info_size: Output buffer to report size of data at @offset.
+ * @items: The optional item list, containing the
+ * well-known name to look up as a KDBUS_ITEM_NAME.
+ * Only needed in case @id is zero.
+ *
+ * On success, the KDBUS_CMD_CONN_INFO ioctl will return 0 and @offset will
+ * tell the user the offset in the connection pool buffer at which to find the
+ * result in a struct kdbus_info.
+ */
+struct kdbus_cmd_info {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ __u64 id;
+ __u64 attach_flags;
+ __u64 offset;
+ __u64 info_size;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * enum kdbus_cmd_match_flags - flags to control the KDBUS_CMD_MATCH_ADD ioctl
+ * @KDBUS_MATCH_REPLACE: If entries with the supplied cookie already
+ * exist, remove them before installing the new
+ * matches.
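+ *
+ * Illustrative sketch only ('conn_fd' is assumed to be a connected bus fd;
+ * a real caller would append match items and add their sizes to .size):
+ *
+ *	struct kdbus_cmd_match cmd = {
+ *		.size = sizeof(cmd),
+ *		.flags = KDBUS_MATCH_REPLACE,
+ *		.cookie = 17,
+ *	};
+ *	ioctl(conn_fd, KDBUS_CMD_MATCH_ADD, &cmd);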
+ */
+enum kdbus_cmd_match_flags {
+ KDBUS_MATCH_REPLACE = 1ULL << 0,
+};
+
+/**
+ * struct kdbus_cmd_match - struct to add or remove matches
+ * @size: The total size of the struct
+ * @flags: Flags for match command (KDBUS_MATCH_*),
+ * userspace → kernel
+ * @return_flags: Command return flags, kernel → userspace
+ * @cookie: Userspace supplied cookie. When removing, the cookie
+ * identifies the match to remove
+ * @items: A list of items for additional information
+ *
+ * This structure is used with the KDBUS_CMD_MATCH_ADD and
+ * KDBUS_CMD_MATCH_REMOVE ioctl.
+ */
+struct kdbus_cmd_match {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ __u64 cookie;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * enum kdbus_make_flags - Flags for KDBUS_CMD_{BUS,ENDPOINT}_MAKE
+ * @KDBUS_MAKE_ACCESS_GROUP: Make the bus or endpoint node group-accessible
+ * @KDBUS_MAKE_ACCESS_WORLD: Make the bus or endpoint node world-accessible
+ */
+enum kdbus_make_flags {
+ KDBUS_MAKE_ACCESS_GROUP = 1ULL << 0,
+ KDBUS_MAKE_ACCESS_WORLD = 1ULL << 1,
+};
+
+/**
+ * enum kdbus_name_flags - flags for KDBUS_CMD_NAME_ACQUIRE
+ * @KDBUS_NAME_REPLACE_EXISTING: Try to replace name of other connections
+ * @KDBUS_NAME_ALLOW_REPLACEMENT: Allow the replacement of the name
+ * @KDBUS_NAME_QUEUE: Name should be queued if busy
+ * @KDBUS_NAME_IN_QUEUE: Name is queued
+ * @KDBUS_NAME_ACTIVATOR: Name is owned by an activator connection
+ * @KDBUS_NAME_PRIMARY: Primary owner of the name
+ * @KDBUS_NAME_ACQUIRED: Name was acquired/queued _now_
+ */
+enum kdbus_name_flags {
+ KDBUS_NAME_REPLACE_EXISTING = 1ULL << 0,
+ KDBUS_NAME_ALLOW_REPLACEMENT = 1ULL << 1,
+ KDBUS_NAME_QUEUE = 1ULL << 2,
+ KDBUS_NAME_IN_QUEUE = 1ULL << 3,
+ KDBUS_NAME_ACTIVATOR = 1ULL << 4,
+ KDBUS_NAME_PRIMARY = 1ULL << 5,
+ KDBUS_NAME_ACQUIRED = 1ULL << 6,
+};
+
+/**
+ * struct kdbus_cmd - generic ioctl payload
+ * @size: Overall size of this structure
+ * @flags: Flags for this ioctl, userspace → kernel
+ * @return_flags: Ioctl return flags, kernel → userspace
+ * @items: Additional items to modify the behavior
+ *
+ * This is a generic ioctl payload object. It's used by all ioctls that only
+ * take flags and items as input.
+ */
+struct kdbus_cmd {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * Ioctl API
+ *
+ * KDBUS_CMD_BUS_MAKE: After opening the "control" node, this command
+ * creates a new bus with the specified
+ * name. The bus is immediately shut down and
+ * cleaned up when the opened file descriptor is
+ * closed.
+ *
+ * KDBUS_CMD_ENDPOINT_MAKE: Creates a new named special endpoint to talk to
+ * the bus. Such endpoints usually carry a more
+ * restrictive policy and grant restricted access
+ * to specific applications.
+ * KDBUS_CMD_ENDPOINT_UPDATE: Update the properties of a custom endpoint. Used
+ * to update the policy.
+ *
+ * KDBUS_CMD_HELLO: By opening the bus node, a connection is
+ * created. After a HELLO the opened connection
+ * becomes an active peer on the bus.
+ * KDBUS_CMD_UPDATE: Update the properties of a connection. Used to
+ * update the metadata subscription mask and
+ * policy.
+ * KDBUS_CMD_BYEBYE: Disconnect a connection. If there are no
+ * messages queued up in the connection's pool,
+ * the call succeeds, and the handle is rendered
+ * unusable. Otherwise, -EBUSY is returned without
+ * any further side-effects.
+ * KDBUS_CMD_FREE: Release the allocated memory in the receiver's
+ * pool.
+ * KDBUS_CMD_CONN_INFO: Retrieve credentials and properties of the
+ * initial creator of the connection. The data was
+ * stored at registration time and does not
+ * necessarily represent the connected process or
+ * the actual state of the process.
+ * KDBUS_CMD_BUS_CREATOR_INFO: Retrieve information of the creator of the bus
+ * a connection is attached to.
+ *
+ * KDBUS_CMD_SEND: Send a message and pass data from userspace to
+ * the kernel.
+ * KDBUS_CMD_RECV: Receive a message from the kernel which is
+ * placed in the receiver's pool.
+ *
+ * KDBUS_CMD_NAME_ACQUIRE: Request a well-known bus name to associate with
+ * the connection. Well-known names are used to
+ * address a peer on the bus.
+ * KDBUS_CMD_NAME_RELEASE: Release a well-known name the connection
+ * currently owns.
+ * KDBUS_CMD_LIST: Retrieve the list of all currently registered
+ * well-known and unique names.
+ *
+ * KDBUS_CMD_MATCH_ADD: Install a match that selects which broadcast
+ * messages should be delivered to the connection.
+ * KDBUS_CMD_MATCH_REMOVE: Remove a current match for broadcast messages.
+ */
+enum kdbus_ioctl_type {
+ /* bus owner (00-0f) */
+ KDBUS_CMD_BUS_MAKE = _IOW(KDBUS_IOCTL_MAGIC, 0x00,
+ struct kdbus_cmd),
+
+ /* endpoint owner (10-1f) */
+ KDBUS_CMD_ENDPOINT_MAKE = _IOW(KDBUS_IOCTL_MAGIC, 0x10,
+ struct kdbus_cmd),
+ KDBUS_CMD_ENDPOINT_UPDATE = _IOW(KDBUS_IOCTL_MAGIC, 0x11,
+ struct kdbus_cmd),
+
+ /* connection owner (80-ff) */
+ KDBUS_CMD_HELLO = _IOWR(KDBUS_IOCTL_MAGIC, 0x80,
+ struct kdbus_cmd_hello),
+ KDBUS_CMD_UPDATE = _IOW(KDBUS_IOCTL_MAGIC, 0x81,
+ struct kdbus_cmd),
+ KDBUS_CMD_BYEBYE = _IOW(KDBUS_IOCTL_MAGIC, 0x82,
+ struct kdbus_cmd),
+ KDBUS_CMD_FREE = _IOW(KDBUS_IOCTL_MAGIC, 0x83,
+ struct kdbus_cmd_free),
+ KDBUS_CMD_CONN_INFO = _IOR(KDBUS_IOCTL_MAGIC, 0x84,
+ struct kdbus_cmd_info),
+ KDBUS_CMD_BUS_CREATOR_INFO = _IOR(KDBUS_IOCTL_MAGIC, 0x85,
+ struct kdbus_cmd_info),
+ KDBUS_CMD_LIST = _IOR(KDBUS_IOCTL_MAGIC, 0x86,
+ struct kdbus_cmd_list),
+
+ KDBUS_CMD_SEND = _IOW(KDBUS_IOCTL_MAGIC, 0x90,
+ struct kdbus_cmd_send),
+ KDBUS_CMD_RECV = _IOR(KDBUS_IOCTL_MAGIC, 0x91,
+ struct kdbus_cmd_recv),
+
+ KDBUS_CMD_NAME_ACQUIRE = _IOW(KDBUS_IOCTL_MAGIC, 0xa0,
+ struct kdbus_cmd),
+ KDBUS_CMD_NAME_RELEASE = _IOW(KDBUS_IOCTL_MAGIC, 0xa1,
+ struct kdbus_cmd),
+
+ KDBUS_CMD_MATCH_ADD = _IOW(KDBUS_IOCTL_MAGIC, 0xb0,
+ struct kdbus_cmd_match),
+ KDBUS_CMD_MATCH_REMOVE = _IOW(KDBUS_IOCTL_MAGIC, 0xb1,
+ struct kdbus_cmd_match),
+};
+
+#endif /* _UAPI_KDBUS_H_ */
diff -Nur linux-4.2/include/uapi/linux/magic.h linux-4.2-pck/include/uapi/linux/magic.h
--- linux-4.2/include/uapi/linux/magic.h 2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/include/uapi/linux/magic.h 2015-09-08 00:48:22.231697056 -0300
@@ -76,4 +76,6 @@
 #define BTRFS_TEST_MAGIC 0x73727279
 #define NSFS_MAGIC 0x6e736673
+#define KDBUS_SUPER_MAGIC 0x44427573
+
 #endif /* __LINUX_MAGIC_H__ */
diff -Nur linux-4.2/include/uapi/linux/tcp.h linux-4.2-pck/include/uapi/linux/tcp.h
--- linux-4.2/include/uapi/linux/tcp.h 2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/include/uapi/linux/tcp.h 2015-09-08 00:48:31.501257347 -0300
@@ -115,6 +115,9 @@
 #define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */
 #define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */
 #define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */
+#define TCP_STEALTH 29
+#define TCP_STEALTH_INTEGRITY 30
+#define TCP_STEALTH_INTEGRITY_LEN 31
 struct tcp_repair_opt {
 __u32 opt_code;
diff -Nur linux-4.2/include/uapi/linux/vt.h linux-4.2-pck/include/uapi/linux/vt.h
--- linux-4.2/include/uapi/linux/vt.h 2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/include/uapi/linux/vt.h 2015-09-08 00:48:22.231697056 -0300
@@ -3,12 +3,26 @@
 /*
+ * We will make this definition solely for the purpose of making packages
+ * such as splashutils build, because they cannot understand that
+ * NR_TTY_DEVICES is defined in the kernel configuration.
+ */
+#ifndef CONFIG_NR_TTY_DEVICES
+#define CONFIG_NR_TTY_DEVICES 63
+#endif
+
+/*
 * These constants are also useful for user-level apps (e.g., VC
 * resizing).
 */
 #define MIN_NR_CONSOLES 1 /* must be at least 1 */
-#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */
-#define MAX_NR_USER_CONSOLES 63 /* must be root to allocate above this */
+
+/*
+ * NR_TTY_DEVICES:
+ * Value MUST be at least 11 and must never be higher than 63
+ */
+#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */
+#define MAX_NR_USER_CONSOLES CONFIG_NR_TTY_DEVICES /* must be root to allocate above this */
 /* Note: the ioctl VT_GETSTATE does not work for consoles 16 and higher (since it returns a short) */
diff -Nur linux-4.2/init/Kconfig linux-4.2-pck/init/Kconfig
--- linux-4.2/init/Kconfig 2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/init/Kconfig 2015-09-08 00:48:22.231697056 -0300
@@ -28,6 +28,26 @@
 menu "General setup"
+config PCK_INTERACTIVE
+ bool "Tune kernel for interactivity"
+ default y
+ help
+ Tunes the kernel for responsiveness at the cost of throughput and power usage.
+
+ --- VM ---
+ Mem dirty before bg writeback..: 10 % -> 128 MiB
+ Mem dirty before sync writeback: 20 % -> 256 MiB
+
+ --- CPU Scheduler ---
+ Scheduling latency.............: 6 -> 3 ms
+ Minimal granularity............: 0.75 -> 0.3 ms
+ Wakeup granularity.............: 1 -> 0.5 ms
+ CPU migration cost.............: 0.5 -> 0.25 ms
+ Bandwidth slice size...........: 5 -> 3 ms
+
+ --- CPU Frequency Scaling ---
+ Ondemand down scaling factor...: 1 -> 10
+
 config BROKEN
 bool
@@ -261,6 +281,19 @@
 depends on SYSCTL
 default y
+config KDBUS
+ tristate "kdbus interprocess communication"
+ depends on TMPFS
+ help
+ D-Bus is a system for low-latency, low-overhead, easy to use
+ interprocess communication (IPC).
+
+ See the man-pages and HTML files in Documentation/kdbus/
+ that are generated by 'make mandocs' and 'make htmldocs'.
+
+ If you have an ordinary machine, select M here. The module
+ will be called kdbus.
+
 config CROSS_MEMORY_ATTACH
 bool "Enable process_vm_readv/writev syscalls"
 depends on MMU
@@ -1137,7 +1170,7 @@
 endif # CGROUPS
 config CHECKPOINT_RESTORE
- bool "Checkpoint/restore support" if EXPERT
+ bool "Checkpoint/restore support"
 select PROC_CHILDREN
 default n
 help
diff -Nur linux-4.2/ipc/Makefile linux-4.2-pck/ipc/Makefile
--- linux-4.2/ipc/Makefile 2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/ipc/Makefile 2015-09-08 00:48:22.231697056 -0300
@@ -9,4 +9,4 @@
 obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
 obj-$(CONFIG_IPC_NS) += namespace.o
 obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o
-
+obj-$(CONFIG_KDBUS) += kdbus/
diff -Nur linux-4.2/ipc/kdbus/Makefile linux-4.2-pck/ipc/kdbus/Makefile
--- linux-4.2/ipc/kdbus/Makefile 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/Makefile 2015-09-08 00:48:22.231697056 -0300
@@ -0,0 +1,33 @@
+#
+# By setting KDBUS_EXT=2, the kdbus module will be built as kdbus2.ko, and
+# KBUILD_MODNAME=kdbus2. This has the effect that all exported objects have
+# different names than usual (kdbus2fs, /sys/fs/kdbus2/) and you can run
+# your test-infrastructure against the kdbus2.ko, while running your system
+# on kdbus.ko.
+#
+# To just build the module, use:
+# make KDBUS_EXT=2 M=ipc/kdbus
+#
+
+kdbus$(KDBUS_EXT)-y := \
+ bus.o \
+ connection.o \
+ endpoint.o \
+ fs.o \
+ handle.o \
+ item.o \
+ main.o \
+ match.o \
+ message.o \
+ metadata.o \
+ names.o \
+ node.o \
+ notify.o \
+ domain.o \
+ policy.o \
+ pool.o \
+ reply.o \
+ queue.o \
+ util.o
+
+obj-$(CONFIG_KDBUS) += kdbus$(KDBUS_EXT).o
diff -Nur linux-4.2/ipc/kdbus/bus.c linux-4.2-pck/ipc/kdbus/bus.c
--- linux-4.2/ipc/kdbus/bus.c 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/bus.c 2015-09-08 00:48:22.231697056 -0300
@@ -0,0 +1,514 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "bus.h"
+#include "notify.h"
+#include "connection.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "handle.h"
+#include "item.h"
+#include "match.h"
+#include "message.h"
+#include "metadata.h"
+#include "names.h"
+#include "policy.h"
+#include "util.h"
+
+static void kdbus_bus_free(struct kdbus_node *node)
+{
+ struct kdbus_bus *bus = container_of(node, struct kdbus_bus, node);
+
+ WARN_ON(!list_empty(&bus->monitors_list));
+ WARN_ON(!hash_empty(bus->conn_hash));
+
+ kdbus_notify_free(bus);
+
+ kdbus_user_unref(bus->creator);
+ kdbus_name_registry_free(bus->name_registry);
+ kdbus_domain_unref(bus->domain);
+ kdbus_policy_db_clear(&bus->policy_db);
+ kdbus_meta_proc_unref(bus->creator_meta);
+ kfree(bus);
+}
+
+static void kdbus_bus_release(struct kdbus_node *node, bool was_active)
+{
+ struct kdbus_bus *bus = container_of(node, struct kdbus_bus, node);
+
+ if (was_active)
+ atomic_dec(&bus->creator->buses);
+}
+
+static struct kdbus_bus *kdbus_bus_new(struct kdbus_domain *domain,
+ const char *name,
+ struct kdbus_bloom_parameter *bloom,
+ const u64 *pattach_owner,
+ u64 flags, kuid_t uid, kgid_t gid)
+{
+ struct kdbus_bus *b;
+ u64 attach_owner;
+ int ret;
+
+ if (bloom->size < 8 || bloom->size > KDBUS_BUS_BLOOM_MAX_SIZE ||
+ !KDBUS_IS_ALIGNED8(bloom->size) || bloom->n_hash < 1)
+ return ERR_PTR(-EINVAL);
+
+ ret = kdbus_sanitize_attach_flags(pattach_owner ?
*pattach_owner : 0, + &attach_owner); + if (ret < 0) + return ERR_PTR(ret); + + ret = kdbus_verify_uid_prefix(name, domain->user_namespace, uid); + if (ret < 0) + return ERR_PTR(ret); + + b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return ERR_PTR(-ENOMEM); + + kdbus_node_init(&b->node, KDBUS_NODE_BUS); + + b->node.free_cb = kdbus_bus_free; + b->node.release_cb = kdbus_bus_release; + b->node.uid = uid; + b->node.gid = gid; + b->node.mode = S_IRUSR | S_IXUSR; + + if (flags & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD)) + b->node.mode |= S_IRGRP | S_IXGRP; + if (flags & KDBUS_MAKE_ACCESS_WORLD) + b->node.mode |= S_IROTH | S_IXOTH; + + b->id = atomic64_inc_return(&domain->last_id); + b->bus_flags = flags; + b->attach_flags_owner = attach_owner; + generate_random_uuid(b->id128); + b->bloom = *bloom; + b->domain = kdbus_domain_ref(domain); + + kdbus_policy_db_init(&b->policy_db); + + init_rwsem(&b->conn_rwlock); + hash_init(b->conn_hash); + INIT_LIST_HEAD(&b->monitors_list); + + INIT_LIST_HEAD(&b->notify_list); + spin_lock_init(&b->notify_lock); + mutex_init(&b->notify_flush_lock); + + ret = kdbus_node_link(&b->node, &domain->node, name); + if (ret < 0) + goto exit_unref; + + /* cache the metadata/credentials of the creator */ + b->creator_meta = kdbus_meta_proc_new(); + if (IS_ERR(b->creator_meta)) { + ret = PTR_ERR(b->creator_meta); + b->creator_meta = NULL; + goto exit_unref; + } + + ret = kdbus_meta_proc_collect(b->creator_meta, + KDBUS_ATTACH_CREDS | + KDBUS_ATTACH_PIDS | + KDBUS_ATTACH_AUXGROUPS | + KDBUS_ATTACH_TID_COMM | + KDBUS_ATTACH_PID_COMM | + KDBUS_ATTACH_EXE | + KDBUS_ATTACH_CMDLINE | + KDBUS_ATTACH_CGROUP | + KDBUS_ATTACH_CAPS | + KDBUS_ATTACH_SECLABEL | + KDBUS_ATTACH_AUDIT); + if (ret < 0) + goto exit_unref; + + b->name_registry = kdbus_name_registry_new(); + if (IS_ERR(b->name_registry)) { + ret = PTR_ERR(b->name_registry); + b->name_registry = NULL; + goto exit_unref; + } + + /* + * Bus-limits of the creator are accounted on its real UID, just like + * all other per-user limits. + */ + b->creator = kdbus_user_lookup(domain, current_uid()); + if (IS_ERR(b->creator)) { + ret = PTR_ERR(b->creator); + b->creator = NULL; + goto exit_unref; + } + + return b; + +exit_unref: + kdbus_node_drain(&b->node); + kdbus_node_unref(&b->node); + return ERR_PTR(ret); +} + +/** + * kdbus_bus_ref() - increase the reference counter of a kdbus_bus + * @bus: The bus to reference + * + * Every user of a bus, except for its creator, must add a reference to the + * kdbus_bus using this function. + * + * Return: the bus itself + */ +struct kdbus_bus *kdbus_bus_ref(struct kdbus_bus *bus) +{ + if (bus) + kdbus_node_ref(&bus->node); + return bus; +} + +/** + * kdbus_bus_unref() - decrease the reference counter of a kdbus_bus + * @bus: The bus to unref + * + * Release a reference. If the reference count drops to 0, the bus will be + * freed. + * + * Return: NULL + */ +struct kdbus_bus *kdbus_bus_unref(struct kdbus_bus *bus) +{ + if (bus) + kdbus_node_unref(&bus->node); + return NULL; +} + +/** + * kdbus_bus_find_conn_by_id() - find a connection with a given id + * @bus: The bus to look for the connection + * @id: The 64-bit connection id + * + * Looks up a connection with a given id. The returned connection + * is ref'ed, and needs to be unref'ed by the user. Returns NULL if + * the connection can't be found. 
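+ *
+ * Typical use (a sketch; 'bus' and 'id' stand for caller-provided values):
+ *
+ *	struct kdbus_conn *c = kdbus_bus_find_conn_by_id(bus, id);
+ *	if (c) {
+ *		... use the connection ...
+ *		kdbus_conn_unref(c);
+ *	}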
+ */
+struct kdbus_conn *kdbus_bus_find_conn_by_id(struct kdbus_bus *bus, u64 id)
+{
+ struct kdbus_conn *conn, *found = NULL;
+
+ down_read(&bus->conn_rwlock);
+ hash_for_each_possible(bus->conn_hash, conn, hentry, id)
+ if (conn->id == id) {
+ found = kdbus_conn_ref(conn);
+ break;
+ }
+ up_read(&bus->conn_rwlock);
+
+ return found;
+}
+
+/**
+ * kdbus_bus_broadcast() - send a message to all subscribed connections
+ * @bus: The bus the connections are connected to
+ * @conn_src: The source connection, may be %NULL for kernel notifications
+ * @staging: Staging object containing the message to send
+ *
+ * Send message to all connections that are currently active on the bus.
+ * Connections must still have matches installed in order to let the message
+ * pass.
+ *
+ * The caller must hold the name-registry lock of @bus.
+ */
+void kdbus_bus_broadcast(struct kdbus_bus *bus,
+ struct kdbus_conn *conn_src,
+ struct kdbus_staging *staging)
+{
+ struct kdbus_conn *conn_dst;
+ unsigned int i;
+ int ret;
+
+ lockdep_assert_held(&bus->name_registry->rwlock);
+
+ /*
+ * Make sure broadcasts are queued on monitors before we send them out
+ * to anyone else. Otherwise, connections might react to broadcasts
+ * before the monitor gets the broadcast queued. In the worst case, the
+ * monitor sees a reaction to the broadcast before the broadcast itself.
+ * We don't give ordering guarantees across connections (and monitors
+ * can re-construct order via sequence numbers), but we should at least
+ * try to avoid re-ordering for monitors.
+ */
+ kdbus_bus_eavesdrop(bus, conn_src, staging);
+
+ down_read(&bus->conn_rwlock);
+ hash_for_each(bus->conn_hash, i, conn_dst, hentry) {
+ if (!kdbus_conn_is_ordinary(conn_dst))
+ continue;
+
+ /*
+ * Check if there is a match for the kmsg object in
+ * the destination connection match db
+ */
+ if (!kdbus_match_db_match_msg(conn_dst->match_db, conn_src,
+ staging))
+ continue;
+
+ if (conn_src) {
+ /*
+ * Anyone can send broadcasts, as they have no
+ * destination. But a receiver needs TALK access to
+ * the sender in order to receive broadcasts.
+ */
+ if (!kdbus_conn_policy_talk(conn_dst, NULL, conn_src))
+ continue;
+ } else {
+ /*
+ * Check if there is a policy db that prevents the
+ * destination connection from receiving this kernel
+ * notification
+ */
+ if (!kdbus_conn_policy_see_notification(conn_dst, NULL,
+ staging->msg))
+ continue;
+ }
+
+ ret = kdbus_conn_entry_insert(conn_src, conn_dst, staging,
+ NULL, NULL);
+ if (ret < 0)
+ kdbus_conn_lost_message(conn_dst);
+ }
+ up_read(&bus->conn_rwlock);
+}
+
+/**
+ * kdbus_bus_eavesdrop() - send a message to all subscribed monitors
+ * @bus: The bus the monitors are connected to
+ * @conn_src: The source connection, may be %NULL for kernel notifications
+ * @staging: Staging object containing the message to send
+ *
+ * Send message to all monitors that are currently active on the bus. Monitors
+ * must still have matches installed in order to let the message pass.
+ *
+ * The caller must hold the name-registry lock of @bus.
+ */
+void kdbus_bus_eavesdrop(struct kdbus_bus *bus,
+ struct kdbus_conn *conn_src,
+ struct kdbus_staging *staging)
+{
+ struct kdbus_conn *conn_dst;
+ int ret;
+
+ /*
+ * Monitor connections get all messages; ignore possible errors
+ * when sending messages to monitor connections.
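+ *
+ * A failed queue insert below is still accounted on the monitor via
+ * kdbus_conn_lost_message(), so the receiver can detect the gap through
+ * the 'dropped_msgs' counter reported by KDBUS_CMD_RECV.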
+ */ + + lockdep_assert_held(&bus->name_registry->rwlock); + + down_read(&bus->conn_rwlock); + list_for_each_entry(conn_dst, &bus->monitors_list, monitor_entry) { + ret = kdbus_conn_entry_insert(conn_src, conn_dst, staging, + NULL, NULL); + if (ret < 0) + kdbus_conn_lost_message(conn_dst); + } + up_read(&bus->conn_rwlock); +} + +/** + * kdbus_cmd_bus_make() - handle KDBUS_CMD_BUS_MAKE + * @domain: domain to operate on + * @argp: command payload + * + * Return: NULL or newly created bus on success, ERR_PTR on failure. + */ +struct kdbus_bus *kdbus_cmd_bus_make(struct kdbus_domain *domain, + void __user *argp) +{ + struct kdbus_bus *bus = NULL; + struct kdbus_cmd *cmd; + struct kdbus_ep *ep = NULL; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_MAKE_NAME, .mandatory = true }, + { .type = KDBUS_ITEM_BLOOM_PARAMETER, .mandatory = true }, + { .type = KDBUS_ITEM_ATTACH_FLAGS_SEND }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_MAKE_ACCESS_GROUP | + KDBUS_MAKE_ACCESS_WORLD, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + + bus = kdbus_bus_new(domain, + argv[1].item->str, &argv[2].item->bloom_parameter, + argv[3].item ? argv[3].item->data64 : NULL, + cmd->flags, current_euid(), current_egid()); + if (IS_ERR(bus)) { + ret = PTR_ERR(bus); + bus = NULL; + goto exit; + } + + if (atomic_inc_return(&bus->creator->buses) > KDBUS_USER_MAX_BUSES) { + atomic_dec(&bus->creator->buses); + ret = -EMFILE; + goto exit; + } + + if (!kdbus_node_activate(&bus->node)) { + atomic_dec(&bus->creator->buses); + ret = -ESHUTDOWN; + goto exit; + } + + ep = kdbus_ep_new(bus, "bus", cmd->flags, bus->node.uid, bus->node.gid, + false); + if (IS_ERR(ep)) { + ret = PTR_ERR(ep); + ep = NULL; + goto exit; + } + + if (!kdbus_node_activate(&ep->node)) { + ret = -ESHUTDOWN; + goto exit; + } + + /* + * Drop our own reference, effectively causing the endpoint to be + * deactivated and released when the parent bus is. + */ + ep = kdbus_ep_unref(ep); + +exit: + ret = kdbus_args_clear(&args, ret); + if (ret < 0) { + if (ep) { + kdbus_node_drain(&ep->node); + kdbus_ep_unref(ep); + } + if (bus) { + kdbus_node_drain(&bus->node); + kdbus_bus_unref(bus); + } + return ERR_PTR(ret); + } + return bus; +} + +/** + * kdbus_cmd_bus_creator_info() - handle KDBUS_CMD_BUS_CREATOR_INFO + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. 
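+ *
+ * Illustrative userspace sketch only ('conn_fd' and 'pool_base' are
+ * assumptions here: a connected bus fd and the start of its mapped pool;
+ * error handling is omitted):
+ *
+ *	struct kdbus_cmd_info cmd = { .size = sizeof(cmd) };
+ *
+ *	if (ioctl(conn_fd, KDBUS_CMD_BUS_CREATOR_INFO, &cmd) == 0) {
+ *		struct kdbus_info *info = (void *)(pool_base + cmd.offset);
+ *		struct kdbus_cmd_free f = { .size = sizeof(f),
+ *					    .offset = cmd.offset };
+ *
+ *		... walk info->items ...
+ *		ioctl(conn_fd, KDBUS_CMD_FREE, &f);
+ *	}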
+ */ +int kdbus_cmd_bus_creator_info(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_cmd_info *cmd; + struct kdbus_bus *bus = conn->ep->bus; + struct kdbus_pool_slice *slice = NULL; + struct kdbus_item *meta_items = NULL; + struct kdbus_item_header item_hdr; + struct kdbus_info info = {}; + size_t meta_size, name_len, cnt = 0; + struct kvec kvec[6]; + u64 attach_flags, size = 0; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + ret = kdbus_sanitize_attach_flags(cmd->attach_flags, &attach_flags); + if (ret < 0) + goto exit; + + attach_flags &= bus->attach_flags_owner; + + ret = kdbus_meta_emit(bus->creator_meta, NULL, NULL, conn, + attach_flags, &meta_items, &meta_size); + if (ret < 0) + goto exit; + + name_len = strlen(bus->node.name) + 1; + info.id = bus->id; + info.flags = bus->bus_flags; + item_hdr.type = KDBUS_ITEM_MAKE_NAME; + item_hdr.size = KDBUS_ITEM_HEADER_SIZE + name_len; + + kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &size); + kdbus_kvec_set(&kvec[cnt++], &item_hdr, sizeof(item_hdr), &size); + kdbus_kvec_set(&kvec[cnt++], bus->node.name, name_len, &size); + cnt += !!kdbus_kvec_pad(&kvec[cnt], &size); + if (meta_size > 0) { + kdbus_kvec_set(&kvec[cnt++], meta_items, meta_size, &size); + cnt += !!kdbus_kvec_pad(&kvec[cnt], &size); + } + + info.size = size; + + slice = kdbus_pool_slice_alloc(conn->pool, size, false); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto exit; + } + + ret = kdbus_pool_slice_copy_kvec(slice, 0, kvec, cnt, size); + if (ret < 0) + goto exit; + + kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->info_size); + + if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) || + kdbus_member_set_user(&cmd->info_size, argp, + typeof(*cmd), info_size)) + ret = -EFAULT; + +exit: + kdbus_pool_slice_release(slice); + kfree(meta_items); + return kdbus_args_clear(&args, ret); +} diff -Nur linux-4.2/ipc/kdbus/bus.h linux-4.2-pck/ipc/kdbus/bus.h --- linux-4.2/ipc/kdbus/bus.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/bus.h 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef __KDBUS_BUS_H +#define __KDBUS_BUS_H + +#include +#include +#include +#include +#include +#include + +#include "metadata.h" +#include "names.h" +#include "node.h" +#include "policy.h" + +struct kdbus_conn; +struct kdbus_domain; +struct kdbus_staging; +struct kdbus_user; + +/** + * struct kdbus_bus - bus in a domain + * @node: kdbus_node + * @id: ID of this bus in the domain + * @bus_flags: Simple pass-through flags from userspace to userspace + * @attach_flags_owner: KDBUS_ATTACH_* flags of bus creator that other + * connections can see or query + * @id128: Unique random 128 bit ID of this bus + * @bloom: Bloom parameters + * @domain: Domain of this bus + * @creator: Creator of the bus + * @creator_meta: Meta information about the bus creator + * @last_message_id: Last used message id + * @policy_db: Policy database for this bus + * @name_registry: Name registry of this bus + * @conn_rwlock: Read/Write lock for all lists of child connections + * @conn_hash: Map of connection IDs + * @monitors_list: Connections that monitor this bus + * @notify_list: List of pending kernel-generated messages + * @notify_lock: Notification list lock + * @notify_flush_lock: Notification flushing lock + */ +struct kdbus_bus { + struct kdbus_node node; + + /* static */ + u64 id; + u64 bus_flags; + u64 attach_flags_owner; + u8 id128[16]; + struct kdbus_bloom_parameter bloom; + struct kdbus_domain *domain; + struct kdbus_user *creator; + struct kdbus_meta_proc *creator_meta; + + /* protected by own locks */ + atomic64_t last_message_id; + struct kdbus_policy_db policy_db; + struct kdbus_name_registry *name_registry; + + /* protected by conn_rwlock */ + struct rw_semaphore conn_rwlock; + DECLARE_HASHTABLE(conn_hash, 8); + struct list_head monitors_list; + + /* protected by notify_lock */ + struct list_head notify_list; + spinlock_t notify_lock; + struct mutex notify_flush_lock; +}; + +struct kdbus_bus *kdbus_bus_ref(struct kdbus_bus *bus); +struct kdbus_bus *kdbus_bus_unref(struct kdbus_bus *bus); + +struct kdbus_conn *kdbus_bus_find_conn_by_id(struct kdbus_bus *bus, u64 id); +void kdbus_bus_broadcast(struct kdbus_bus *bus, + struct kdbus_conn *conn_src, + struct kdbus_staging *staging); +void kdbus_bus_eavesdrop(struct kdbus_bus *bus, + struct kdbus_conn *conn_src, + struct kdbus_staging *staging); + +struct kdbus_bus *kdbus_cmd_bus_make(struct kdbus_domain *domain, + void __user *argp); +int kdbus_cmd_bus_creator_info(struct kdbus_conn *conn, void __user *argp); + +#endif diff -Nur linux-4.2/ipc/kdbus/connection.c linux-4.2-pck/ipc/kdbus/connection.c --- linux-4.2/ipc/kdbus/connection.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/connection.c 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,2227 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "handle.h" +#include "match.h" +#include "message.h" +#include "metadata.h" +#include "names.h" +#include "domain.h" +#include "item.h" +#include "notify.h" +#include "policy.h" +#include "pool.h" +#include "reply.h" +#include "util.h" +#include "queue.h" + +#define KDBUS_CONN_ACTIVE_BIAS (INT_MIN + 2) +#define KDBUS_CONN_ACTIVE_NEW (INT_MIN + 1) + +static struct kdbus_conn *kdbus_conn_new(struct kdbus_ep *ep, + struct file *file, + struct kdbus_cmd_hello *hello, + const char *name, + const struct kdbus_creds *creds, + const struct kdbus_pids *pids, + const char *seclabel, + const char *conn_description) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key __key; +#endif + struct kdbus_pool_slice *slice = NULL; + struct kdbus_bus *bus = ep->bus; + struct kdbus_conn *conn; + u64 attach_flags_send; + u64 attach_flags_recv; + u64 items_size = 0; + bool is_policy_holder; + bool is_activator; + bool is_monitor; + bool privileged; + bool owner; + struct kvec kvec; + int ret; + + struct { + u64 size; + u64 type; + struct kdbus_bloom_parameter bloom; + } bloom_item; + + privileged = kdbus_ep_is_privileged(ep, file); + owner = kdbus_ep_is_owner(ep, file); + + is_monitor = hello->flags & KDBUS_HELLO_MONITOR; + is_activator = hello->flags & KDBUS_HELLO_ACTIVATOR; + is_policy_holder = hello->flags & KDBUS_HELLO_POLICY_HOLDER; + + if (!hello->pool_size || !IS_ALIGNED(hello->pool_size, PAGE_SIZE)) + return ERR_PTR(-EINVAL); + if (is_monitor + is_activator + is_policy_holder > 1) + return ERR_PTR(-EINVAL); + if (name && !is_activator && !is_policy_holder) + return ERR_PTR(-EINVAL); + if (!name && (is_activator || is_policy_holder)) + return ERR_PTR(-EINVAL); + if (name && !kdbus_name_is_valid(name, true)) + return ERR_PTR(-EINVAL); + if (is_monitor && ep->user) + return ERR_PTR(-EOPNOTSUPP); + if (!owner && (is_activator || is_policy_holder || is_monitor)) + return ERR_PTR(-EPERM); + if (!owner && (creds || pids || seclabel)) + return ERR_PTR(-EPERM); + + ret = kdbus_sanitize_attach_flags(hello->attach_flags_send, + &attach_flags_send); + if (ret < 0) + return ERR_PTR(ret); + + ret = kdbus_sanitize_attach_flags(hello->attach_flags_recv, + &attach_flags_recv); + if (ret < 0) + return ERR_PTR(ret); + + conn = kzalloc(sizeof(*conn), GFP_KERNEL); + if (!conn) + return ERR_PTR(-ENOMEM); + + kref_init(&conn->kref); + atomic_set(&conn->active, KDBUS_CONN_ACTIVE_NEW); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_init_map(&conn->dep_map, "s_active", &__key, 0); +#endif + mutex_init(&conn->lock); + INIT_LIST_HEAD(&conn->names_list); + INIT_LIST_HEAD(&conn->reply_list); + atomic_set(&conn->request_count, 0); + atomic_set(&conn->lost_count, 0); + INIT_DELAYED_WORK(&conn->work, kdbus_reply_list_scan_work); + conn->cred = get_cred(file->f_cred); + conn->pid = get_pid(task_pid(current)); + get_fs_root(current->fs, &conn->root_path); + init_waitqueue_head(&conn->wait); + kdbus_queue_init(&conn->queue); + conn->privileged = privileged; + conn->owner = owner; + conn->ep = kdbus_ep_ref(ep); + conn->id = atomic64_inc_return(&bus->domain->last_id); + conn->flags = hello->flags; + atomic64_set(&conn->attach_flags_send, attach_flags_send); + atomic64_set(&conn->attach_flags_recv, attach_flags_recv); + INIT_LIST_HEAD(&conn->monitor_entry); + + if 
(conn_description) { + conn->description = kstrdup(conn_description, GFP_KERNEL); + if (!conn->description) { + ret = -ENOMEM; + goto exit_unref; + } + } + + conn->pool = kdbus_pool_new(conn->description, hello->pool_size); + if (IS_ERR(conn->pool)) { + ret = PTR_ERR(conn->pool); + conn->pool = NULL; + goto exit_unref; + } + + conn->match_db = kdbus_match_db_new(); + if (IS_ERR(conn->match_db)) { + ret = PTR_ERR(conn->match_db); + conn->match_db = NULL; + goto exit_unref; + } + + /* return properties of this connection to the caller */ + hello->bus_flags = bus->bus_flags; + hello->id = conn->id; + + BUILD_BUG_ON(sizeof(bus->id128) != sizeof(hello->id128)); + memcpy(hello->id128, bus->id128, sizeof(hello->id128)); + + /* privileged processes can impersonate somebody else */ + if (creds || pids || seclabel) { + conn->meta_fake = kdbus_meta_fake_new(); + if (IS_ERR(conn->meta_fake)) { + ret = PTR_ERR(conn->meta_fake); + conn->meta_fake = NULL; + goto exit_unref; + } + + ret = kdbus_meta_fake_collect(conn->meta_fake, + creds, pids, seclabel); + if (ret < 0) + goto exit_unref; + } else { + conn->meta_proc = kdbus_meta_proc_new(); + if (IS_ERR(conn->meta_proc)) { + ret = PTR_ERR(conn->meta_proc); + conn->meta_proc = NULL; + goto exit_unref; + } + + ret = kdbus_meta_proc_collect(conn->meta_proc, + KDBUS_ATTACH_CREDS | + KDBUS_ATTACH_PIDS | + KDBUS_ATTACH_AUXGROUPS | + KDBUS_ATTACH_TID_COMM | + KDBUS_ATTACH_PID_COMM | + KDBUS_ATTACH_EXE | + KDBUS_ATTACH_CMDLINE | + KDBUS_ATTACH_CGROUP | + KDBUS_ATTACH_CAPS | + KDBUS_ATTACH_SECLABEL | + KDBUS_ATTACH_AUDIT); + if (ret < 0) + goto exit_unref; + } + + /* + * Account the connection against the current user (UID), or for + * custom endpoints use the anonymous user assigned to the endpoint. + * Note that limits are always accounted against the real UID, not + * the effective UID (cred->user always points to the accounting of + * cred->uid, not cred->euid). + * In case the caller is privileged, we allow changing the accounting + * to the faked user. 
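+ *
+ * In short, the decision made by the code below is:
+ *	- custom endpoint (ep->user set): account on the endpoint's
+ *	  anonymous user
+ *	- privileged caller with a valid faked uid: account on the faked uid
+ *	- otherwise: account on the caller's real uid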
+ */ + if (ep->user) { + conn->user = kdbus_user_ref(ep->user); + } else { + kuid_t uid; + + if (conn->meta_fake && uid_valid(conn->meta_fake->uid) && + conn->privileged) + uid = conn->meta_fake->uid; + else + uid = conn->cred->uid; + + conn->user = kdbus_user_lookup(ep->bus->domain, uid); + if (IS_ERR(conn->user)) { + ret = PTR_ERR(conn->user); + conn->user = NULL; + goto exit_unref; + } + } + + if (atomic_inc_return(&conn->user->connections) > KDBUS_USER_MAX_CONN) { + /* decremented by destructor as conn->user is valid */ + ret = -EMFILE; + goto exit_unref; + } + + bloom_item.size = sizeof(bloom_item); + bloom_item.type = KDBUS_ITEM_BLOOM_PARAMETER; + bloom_item.bloom = bus->bloom; + kdbus_kvec_set(&kvec, &bloom_item, bloom_item.size, &items_size); + + slice = kdbus_pool_slice_alloc(conn->pool, items_size, false); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto exit_unref; + } + + ret = kdbus_pool_slice_copy_kvec(slice, 0, &kvec, 1, items_size); + if (ret < 0) + goto exit_unref; + + kdbus_pool_slice_publish(slice, &hello->offset, &hello->items_size); + kdbus_pool_slice_release(slice); + + return conn; + +exit_unref: + kdbus_pool_slice_release(slice); + kdbus_conn_unref(conn); + return ERR_PTR(ret); +} + +static void __kdbus_conn_free(struct kref *kref) +{ + struct kdbus_conn *conn = container_of(kref, struct kdbus_conn, kref); + + WARN_ON(kdbus_conn_active(conn)); + WARN_ON(delayed_work_pending(&conn->work)); + WARN_ON(!list_empty(&conn->queue.msg_list)); + WARN_ON(!list_empty(&conn->names_list)); + WARN_ON(!list_empty(&conn->reply_list)); + + if (conn->user) { + atomic_dec(&conn->user->connections); + kdbus_user_unref(conn->user); + } + + kdbus_meta_fake_free(conn->meta_fake); + kdbus_meta_proc_unref(conn->meta_proc); + kdbus_match_db_free(conn->match_db); + kdbus_pool_free(conn->pool); + kdbus_ep_unref(conn->ep); + path_put(&conn->root_path); + put_pid(conn->pid); + put_cred(conn->cred); + kfree(conn->description); + kfree(conn->quota); + kfree(conn); +} + +/** + * kdbus_conn_ref() - take a connection reference + * @conn: Connection, may be %NULL + * + * Return: the connection itself + */ +struct kdbus_conn *kdbus_conn_ref(struct kdbus_conn *conn) +{ + if (conn) + kref_get(&conn->kref); + return conn; +} + +/** + * kdbus_conn_unref() - drop a connection reference + * @conn: Connection (may be NULL) + * + * When the last reference is dropped, the connection's internal structure + * is freed. + * + * Return: NULL + */ +struct kdbus_conn *kdbus_conn_unref(struct kdbus_conn *conn) +{ + if (conn) + kref_put(&conn->kref, __kdbus_conn_free); + return NULL; +} + +/** + * kdbus_conn_active() - connection is not disconnected + * @conn: Connection to check + * + * Return true if the connection was not disconnected, yet. Note that a + * connection might be disconnected asynchronously, unless you hold the + * connection lock. If that's not suitable for you, see kdbus_conn_acquire() to + * suppress connection shutdown for a short period. + * + * Return: true if the connection is still active + */ +bool kdbus_conn_active(const struct kdbus_conn *conn) +{ + return atomic_read(&conn->active) >= 0; +} + +/** + * kdbus_conn_acquire() - acquire an active connection reference + * @conn: Connection + * + * Users can close a connection via KDBUS_BYEBYE (or by destroying the + * endpoint/bus/...) at any time. Whenever this happens, we should deny any + * user-visible action on this connection and signal ECONNRESET instead. 
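+ *
+ * A typical pattern looks like this (sketch only):
+ *
+ *	if (kdbus_conn_acquire(conn) < 0)
+ *		return -ECONNRESET;
+ *	... do short, non-blocking work on 'conn' ...
+ *	kdbus_conn_release(conn);
+ *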
+ * To avoid testing for connection availability every time you take the
+ * connection-lock, you can acquire a connection for short periods.
+ *
+ * By calling kdbus_conn_acquire(), you gain an "active reference" to the
+ * connection. You must also hold a regular reference at any time! As long as
+ * you hold the active-ref, the connection will not be shut down. However, if
+ * the connection was shut down, you can never acquire an active-ref again.
+ *
+ * kdbus_conn_disconnect() disables the connection and then waits for all active
+ * references to be dropped. It will also wake up any pending operation.
+ * However, you must not sleep for an indefinite period while holding an
+ * active-reference. Otherwise, kdbus_conn_disconnect() might stall. If you need
+ * to sleep for an indefinite period, either release the reference and try to
+ * acquire it again after waking up, or make kdbus_conn_disconnect() wake up
+ * your wait-queue.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_conn_acquire(struct kdbus_conn *conn)
+{
+ if (!atomic_inc_unless_negative(&conn->active))
+ return -ECONNRESET;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire_read(&conn->dep_map, 0, 1, _RET_IP_);
+#endif
+
+ return 0;
+}
+
+/**
+ * kdbus_conn_release() - release an active connection reference
+ * @conn: Connection
+ *
+ * This releases an active reference that has been acquired via
+ * kdbus_conn_acquire(). If the connection was already disabled and this is the
+ * last active-ref that is dropped, the disconnect-waiter will be woken up and
+ * properly close the connection.
+ */
+void kdbus_conn_release(struct kdbus_conn *conn)
+{
+ int v;
+
+ if (!conn)
+ return;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_release(&conn->dep_map, 1, _RET_IP_);
+#endif
+
+ v = atomic_dec_return(&conn->active);
+ if (v != KDBUS_CONN_ACTIVE_BIAS)
+ return;
+
+ wake_up_all(&conn->wait);
+}
+
+static int kdbus_conn_connect(struct kdbus_conn *conn, const char *name)
+{
+ struct kdbus_ep *ep = conn->ep;
+ struct kdbus_bus *bus = ep->bus;
+ int ret;
+
+ if (WARN_ON(atomic_read(&conn->active) != KDBUS_CONN_ACTIVE_NEW))
+ return -EALREADY;
+
+ /* make sure the ep-node is active while we add our connection */
+ if (!kdbus_node_acquire(&ep->node))
+ return -ESHUTDOWN;
+
+ /* lock order: domain -> bus -> ep -> names -> conn */
+ mutex_lock(&ep->lock);
+ down_write(&bus->conn_rwlock);
+
+ /* link into monitor list */
+ if (kdbus_conn_is_monitor(conn))
+ list_add_tail(&conn->monitor_entry, &bus->monitors_list);
+
+ /* link into bus and endpoint */
+ list_add_tail(&conn->ep_entry, &ep->conn_list);
+ hash_add(bus->conn_hash, &conn->hentry, conn->id);
+
+ /* enable lookups and acquire active ref */
+ atomic_set(&conn->active, 1);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire_read(&conn->dep_map, 0, 1, _RET_IP_);
+#endif
+
+ up_write(&bus->conn_rwlock);
+ mutex_unlock(&ep->lock);
+
+ kdbus_node_release(&ep->node);
+
+ /*
+ * Notify subscribers about the new active connection, unless it is
+ * a monitor. Monitors are invisible on the bus, can't be addressed
+ * directly, and won't cause any notifications.
+	 */
+	if (!kdbus_conn_is_monitor(conn)) {
+		ret = kdbus_notify_id_change(bus, KDBUS_ITEM_ID_ADD,
+					     conn->id, conn->flags);
+		if (ret < 0)
+			goto exit_disconnect;
+	}
+
+	if (kdbus_conn_is_activator(conn)) {
+		u64 flags = KDBUS_NAME_ACTIVATOR;
+
+		if (WARN_ON(!name)) {
+			ret = -EINVAL;
+			goto exit_disconnect;
+		}
+
+		ret = kdbus_name_acquire(bus->name_registry, conn, name,
+					 flags, NULL);
+		if (ret < 0)
+			goto exit_disconnect;
+	}
+
+	kdbus_conn_release(conn);
+	kdbus_notify_flush(bus);
+	return 0;
+
+exit_disconnect:
+	kdbus_conn_release(conn);
+	kdbus_conn_disconnect(conn, false);
+	return ret;
+}
+
+/**
+ * kdbus_conn_disconnect() - disconnect a connection
+ * @conn:		The connection to disconnect
+ * @ensure_queue_empty:	Flag to indicate if the call should fail in
+ *			case the connection's message list is not
+ *			empty
+ *
+ * If @ensure_queue_empty is true and the connection has pending messages,
+ * -EBUSY is returned.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+int kdbus_conn_disconnect(struct kdbus_conn *conn, bool ensure_queue_empty)
+{
+	struct kdbus_queue_entry *entry, *tmp;
+	struct kdbus_bus *bus = conn->ep->bus;
+	struct kdbus_reply *r, *r_tmp;
+	struct kdbus_conn *c;
+	int i, v;
+
+	mutex_lock(&conn->lock);
+	v = atomic_read(&conn->active);
+	if (v == KDBUS_CONN_ACTIVE_NEW) {
+		/* was never connected */
+		mutex_unlock(&conn->lock);
+		return 0;
+	}
+	if (v < 0) {
+		/* already dead */
+		mutex_unlock(&conn->lock);
+		return -ECONNRESET;
+	}
+	if (ensure_queue_empty && !list_empty(&conn->queue.msg_list)) {
+		/* still busy */
+		mutex_unlock(&conn->lock);
+		return -EBUSY;
+	}
+
+	atomic_add(KDBUS_CONN_ACTIVE_BIAS, &conn->active);
+	mutex_unlock(&conn->lock);
+
+	wake_up_interruptible(&conn->wait);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	rwsem_acquire(&conn->dep_map, 0, 0, _RET_IP_);
+	if (atomic_read(&conn->active) != KDBUS_CONN_ACTIVE_BIAS)
+		lock_contended(&conn->dep_map, _RET_IP_);
+#endif
+
+	wait_event(conn->wait,
+		   atomic_read(&conn->active) == KDBUS_CONN_ACTIVE_BIAS);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	lock_acquired(&conn->dep_map, _RET_IP_);
+	rwsem_release(&conn->dep_map, 1, _RET_IP_);
+#endif
+
+	cancel_delayed_work_sync(&conn->work);
+	kdbus_policy_remove_owner(&conn->ep->bus->policy_db, conn);
+
+	/* lock order: domain -> bus -> ep -> names -> conn */
+	mutex_lock(&conn->ep->lock);
+	down_write(&bus->conn_rwlock);
+
+	/* remove from bus and endpoint */
+	hash_del(&conn->hentry);
+	list_del(&conn->monitor_entry);
+	list_del(&conn->ep_entry);
+
+	up_write(&bus->conn_rwlock);
+	mutex_unlock(&conn->ep->lock);
+
+	/*
+	 * Remove all names associated with this connection; this possibly
+	 * moves queued messages back to the activator connection.
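+	 * (The hand-over of queued messages itself is implemented by
+	 * kdbus_conn_move_messages(), see below.)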
+	 */
+	kdbus_name_release_all(bus->name_registry, conn);
+
+	/* if we die while other connections wait for our reply, notify them */
+	mutex_lock(&conn->lock);
+	list_for_each_entry_safe(entry, tmp, &conn->queue.msg_list, entry) {
+		if (entry->reply)
+			kdbus_notify_reply_dead(bus,
+						entry->reply->reply_dst->id,
+						entry->reply->cookie);
+		kdbus_queue_entry_free(entry);
+	}
+
+	list_for_each_entry_safe(r, r_tmp, &conn->reply_list, entry)
+		kdbus_reply_unlink(r);
+	mutex_unlock(&conn->lock);
+
+	/* lock order: domain -> bus -> ep -> names -> conn */
+	down_read(&bus->conn_rwlock);
+	hash_for_each(bus->conn_hash, i, c, hentry) {
+		mutex_lock(&c->lock);
+		list_for_each_entry_safe(r, r_tmp, &c->reply_list, entry) {
+			if (r->reply_src != conn)
+				continue;
+
+			if (r->sync)
+				kdbus_sync_reply_wakeup(r, -EPIPE);
+			else
+				/* send a 'connection dead' notification */
+				kdbus_notify_reply_dead(bus, c->id, r->cookie);
+
+			kdbus_reply_unlink(r);
+		}
+		mutex_unlock(&c->lock);
+	}
+	up_read(&bus->conn_rwlock);
+
+	if (!kdbus_conn_is_monitor(conn))
+		kdbus_notify_id_change(bus, KDBUS_ITEM_ID_REMOVE,
+				       conn->id, conn->flags);
+
+	kdbus_notify_flush(bus);
+
+	return 0;
+}
+
+/**
+ * kdbus_conn_has_name() - check if a connection owns a name
+ * @conn:	Connection
+ * @name:	Well-known name to check for
+ *
+ * The caller must hold the registry lock of conn->ep->bus.
+ *
+ * Return: true if the name is currently owned by the connection
+ */
+bool kdbus_conn_has_name(struct kdbus_conn *conn, const char *name)
+{
+	struct kdbus_name_owner *owner;
+
+	lockdep_assert_held(&conn->ep->bus->name_registry->rwlock);
+
+	list_for_each_entry(owner, &conn->names_list, conn_entry)
+		if (!(owner->flags & KDBUS_NAME_IN_QUEUE) &&
+		    !strcmp(name, owner->name->name))
+			return true;
+
+	return false;
+}
+
+/* per-user quota state tracked on each connection's incoming queue */
+struct kdbus_quota {
+	u32 memory;
+	u16 msgs;
+	u8 fds;
+};
+
+/**
+ * kdbus_conn_quota_inc() - increase quota accounting
+ * @c:		connection owning the quota tracking
+ * @u:		user to account for (or NULL for kernel accounting)
+ * @memory:	size of memory to account for
+ * @fds:	number of FDs to account for
+ *
+ * This call manages the quotas on connection @c. That is, it's used if other
+ * users want to use the resources of connection @c, which so far only concerns
+ * the receive queue of the destination.
+ *
+ * This increases the quota-accounting for user @u by @memory bytes and @fds
+ * file descriptors. If the user has already reached the quota limits, this
+ * call will not do any accounting but return a negative error code indicating
+ * the failure.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_conn_quota_inc(struct kdbus_conn *c, struct kdbus_user *u,
+			 size_t memory, size_t fds)
+{
+	struct kdbus_quota *quota;
+	size_t available, accounted;
+	unsigned int id;
+
+	/*
+	 * Pool Layout:
+	 * 50% of a pool is always owned by the connection. It is reserved for
+	 * kernel queries, handling received messages and other tasks that are
+	 * under control of the pool owner. The other 50% of the pool is used
+	 * as incoming queue.
+	 * As we optionally support user-space based policies, we need fair
+	 * allocation schemes. Furthermore, resource utilization should be
+	 * maximized, so only minimal resources stay reserved. However, we need
+	 * to adapt to a dynamic number of users, as we cannot know how many
+	 * users will talk to a connection. Therefore, the current allocation
+	 * works like this:
+	 * We limit the number of bytes in a destination's pool per sending
+	 * user.
The space available for a user is 33% of the unused pool space + * (whereas the space used by the user itself is also treated as + * 'unused'). This way, we favor users coming first, but keep enough + * pool space available for any following users. Given that messages are + * dequeued in FIFO order, this should balance nicely if the number of + * users grows. At the same time, this algorithm guarantees that the + * space available to a connection is reduced dynamically, the more + * concurrent users talk to a connection. + */ + + /* per user-accounting is expensive, so we keep state small */ + BUILD_BUG_ON(sizeof(quota->memory) != 4); + BUILD_BUG_ON(sizeof(quota->msgs) != 2); + BUILD_BUG_ON(sizeof(quota->fds) != 1); + BUILD_BUG_ON(KDBUS_CONN_MAX_MSGS > U16_MAX); + BUILD_BUG_ON(KDBUS_CONN_MAX_FDS_PER_USER > U8_MAX); + + id = u ? u->id : KDBUS_USER_KERNEL_ID; + if (id >= c->n_quota) { + unsigned int users; + + users = max(KDBUS_ALIGN8(id) + 8, id); + quota = krealloc(c->quota, users * sizeof(*quota), + GFP_KERNEL | __GFP_ZERO); + if (!quota) + return -ENOMEM; + + c->n_quota = users; + c->quota = quota; + } + + quota = &c->quota[id]; + kdbus_pool_accounted(c->pool, &available, &accounted); + + /* half the pool is _always_ reserved for the pool owner */ + available /= 2; + + /* + * Pool owner slices are un-accounted slices; they can claim more + * than 50% of the queue. However, the slices we're dealing with here + * belong to the incoming queue, hence they are 'accounted' slices + * to which the 50%-limit applies. + */ + if (available < accounted) + return -ENOBUFS; + + /* 1/3 of the remaining space (including your own memory) */ + available = (available - accounted + quota->memory) / 3; + + if (available < quota->memory || + available - quota->memory < memory || + quota->memory + memory > U32_MAX) + return -ENOBUFS; + if (quota->msgs >= KDBUS_CONN_MAX_MSGS) + return -ENOBUFS; + if (quota->fds + fds < quota->fds || + quota->fds + fds > KDBUS_CONN_MAX_FDS_PER_USER) + return -EMFILE; + + quota->memory += memory; + quota->fds += fds; + ++quota->msgs; + return 0; +} + +/** + * kdbus_conn_quota_dec() - decrease quota accounting + * @c: connection owning the quota tracking + * @u: user which was accounted for (or NULL for kernel accounting) + * @memory: size of memory which was accounted for + * @fds: number of FDs which were accounted for + * + * This does the reverse of kdbus_conn_quota_inc(). You have to release any + * accounted resources that you called kdbus_conn_quota_inc() for. However, you + * must not call kdbus_conn_quota_dec() if the accounting failed (that is, + * kdbus_conn_quota_inc() failed). + */ +void kdbus_conn_quota_dec(struct kdbus_conn *c, struct kdbus_user *u, + size_t memory, size_t fds) +{ + struct kdbus_quota *quota; + unsigned int id; + + id = u ? u->id : KDBUS_USER_KERNEL_ID; + if (WARN_ON(id >= c->n_quota)) + return; + + quota = &c->quota[id]; + + if (!WARN_ON(quota->msgs == 0)) + --quota->msgs; + if (!WARN_ON(quota->memory < memory)) + quota->memory -= memory; + if (!WARN_ON(quota->fds < fds)) + quota->fds -= fds; +} + +/** + * kdbus_conn_lost_message() - handle lost messages + * @c: connection that lost a message + * + * kdbus is reliable. That means, we try hard to never lose messages. However, + * memory is limited, so we cannot rely on transmissions to never fail. + * Therefore, we use quota-limits to let callers know if their unicast message + * cannot be transmitted to a peer. 
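+ * For example, a sender that exceeds its share observes -ENOBUFS (or
+ * -EMFILE for file-descriptor exhaustion) on its unicast SEND.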
+ * This works fine for unicasts, but for broadcasts we cannot make the
+ * caller handle the transmission failure. Instead, we must let the
+ * destination know that it couldn't receive a broadcast.
+ * As this is an unlikely scenario, we keep it simple. A single lost-counter
+ * remembers the number of lost messages since the last call to RECV. The next
+ * message retrieval will notify the connection that it lost messages since
+ * the last retrieval and thus should resync its state.
+ */
+void kdbus_conn_lost_message(struct kdbus_conn *c)
+{
+	if (atomic_inc_return(&c->lost_count) == 1)
+		wake_up_interruptible(&c->wait);
+}
+
+/* Callers should take the conn_dst lock */
+static struct kdbus_queue_entry *
+kdbus_conn_entry_make(struct kdbus_conn *conn_src,
+		      struct kdbus_conn *conn_dst,
+		      struct kdbus_staging *staging)
+{
+	/* The remote connection was disconnected */
+	if (!kdbus_conn_active(conn_dst))
+		return ERR_PTR(-ECONNRESET);
+
+	/*
+	 * If the connection does not accept file descriptors but the message
+	 * has some attached, refuse it.
+	 *
+	 * If this is a monitor connection, accept the message. In that
+	 * case, all file descriptors will be set to -1 at receive time.
+	 */
+	if (!kdbus_conn_is_monitor(conn_dst) &&
+	    !(conn_dst->flags & KDBUS_HELLO_ACCEPT_FD) &&
+	    staging->gaps && staging->gaps->n_fds > 0)
+		return ERR_PTR(-ECOMM);
+
+	return kdbus_queue_entry_new(conn_src, conn_dst, staging);
+}
+
+/*
+ * When synchronously responding to a message, allocate a queue entry
+ * and attach it to the reply tracking object.
+ * The connection's queue will never get to see it.
+ */
+static int kdbus_conn_entry_sync_attach(struct kdbus_conn *conn_dst,
+					struct kdbus_staging *staging,
+					struct kdbus_reply *reply_wake)
+{
+	struct kdbus_queue_entry *entry;
+	int remote_ret, ret = 0;
+
+	mutex_lock(&reply_wake->reply_dst->lock);
+
+	/*
+	 * If we are still waiting then proceed, allocate a queue
+	 * entry and attach it to the reply object
+	 */
+	if (reply_wake->waiting) {
+		entry = kdbus_conn_entry_make(reply_wake->reply_src, conn_dst,
+					      staging);
+		if (IS_ERR(entry))
+			ret = PTR_ERR(entry);
+		else
+			/* Attach the entry to the reply object */
+			reply_wake->queue_entry = entry;
+	} else {
+		ret = -ECONNRESET;
+	}
+
+	/*
+	 * Update the reply object and wake up the remote peer only
+	 * on appropriate return codes:
+	 *
+	 * * -ECOMM: if the replying connection failed with -ECOMM,
+	 *   wake up the remote peer with -EREMOTEIO.
+	 *
+	 *   We do this to differentiate between -ECOMM errors from
+	 *   the original sender's perspective: an -ECOMM error during
+	 *   the sync send, and an -ECOMM error during the sync reply;
+	 *   the latter is rewritten to -EREMOTEIO.
+	 *
+	 * * Wake up on all other return codes.
+	 */
+	remote_ret = ret;
+
+	if (ret == -ECOMM)
+		remote_ret = -EREMOTEIO;
+
+	kdbus_sync_reply_wakeup(reply_wake, remote_ret);
+	kdbus_reply_unlink(reply_wake);
+	mutex_unlock(&reply_wake->reply_dst->lock);
+
+	return ret;
+}
+
+/**
+ * kdbus_conn_entry_insert() - enqueue a message into the receiver's pool
+ * @conn_src:		The sending connection
+ * @conn_dst:		The connection to queue into
+ * @staging:		Message to send
+ * @reply:		The reply tracker to attach to the queue entry
+ * @name:		Destination name this msg is sent to, or NULL
+ *
+ * Return: 0 on success, negative error otherwise.
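+ *
+ * An illustrative call, as issued by the unicast path below (a sketch;
+ * @staging and the pinned connections are set up by the caller):
+ *
+ *	ret = kdbus_conn_entry_insert(src, dst, staging, wait, name);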
+ */
+int kdbus_conn_entry_insert(struct kdbus_conn *conn_src,
+			    struct kdbus_conn *conn_dst,
+			    struct kdbus_staging *staging,
+			    struct kdbus_reply *reply,
+			    const struct kdbus_name_entry *name)
+{
+	struct kdbus_queue_entry *entry;
+	int ret;
+
+	kdbus_conn_lock2(conn_src, conn_dst);
+
+	entry = kdbus_conn_entry_make(conn_src, conn_dst, staging);
+	if (IS_ERR(entry)) {
+		ret = PTR_ERR(entry);
+		goto exit_unlock;
+	}
+
+	if (reply) {
+		kdbus_reply_link(reply);
+		if (!reply->sync)
+			schedule_delayed_work(&conn_src->work, 0);
+	}
+
+	/*
+	 * Record the sequence number of the registered name; it will
+	 * be remembered by the queue, in case messages addressed to a
+	 * name need to be moved from or to an activator.
+	 */
+	if (name)
+		entry->dst_name_id = name->name_id;
+
+	kdbus_queue_entry_enqueue(entry, reply);
+	wake_up_interruptible(&conn_dst->wait);
+
+	ret = 0;
+
+exit_unlock:
+	kdbus_conn_unlock2(conn_src, conn_dst);
+	return ret;
+}
+
+static int kdbus_conn_wait_reply(struct kdbus_conn *conn_src,
+				 struct kdbus_cmd_send *cmd_send,
+				 struct file *ioctl_file,
+				 struct file *cancel_fd,
+				 struct kdbus_reply *reply_wait,
+				 ktime_t expire)
+{
+	struct kdbus_queue_entry *entry;
+	struct poll_wqueues pwq = {};
+	int ret;
+
+	if (WARN_ON(!reply_wait))
+		return -EIO;
+
+	/*
+	 * Block until the reply arrives. reply_wait is left untouched
+	 * by the timeout scans that might be conducted for other,
+	 * asynchronous replies of conn_src.
+	 */
+
+	poll_initwait(&pwq);
+	poll_wait(ioctl_file, &conn_src->wait, &pwq.pt);
+
+	for (;;) {
+		/*
+		 * Any of the following conditions will stop our synchronously
+		 * blocking SEND command:
+		 *
+		 * a) The origin sender closed its connection
+		 * b) The remote peer answered, setting reply_wait->waiting = 0
+		 * c) The cancel FD was written to
+		 * d) A signal was received
+		 * e) The specified timeout was reached, and none of the above
+		 *    conditions kicked in.
+		 */
+
+		/*
+		 * We have already acquired an active reference when
+		 * entering here, but another thread may call
+		 * KDBUS_CMD_BYEBYE which does not acquire an active
+		 * reference, therefore kdbus_conn_disconnect() will
+		 * not wait for us.
+		 */
+		if (!kdbus_conn_active(conn_src)) {
+			ret = -ECONNRESET;
+			break;
+		}
+
+		/*
+		 * After the replying peer unsets the waiting variable,
+		 * it will wake us up.
+		 */
+		if (!reply_wait->waiting) {
+			ret = reply_wait->err;
+			break;
+		}
+
+		if (cancel_fd) {
+			unsigned int r;
+
+			r = cancel_fd->f_op->poll(cancel_fd, &pwq.pt);
+			if (r & POLLIN) {
+				ret = -ECANCELED;
+				break;
+			}
+		}
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		if (!poll_schedule_timeout(&pwq, TASK_INTERRUPTIBLE,
+					   &expire, 0)) {
+			ret = -ETIMEDOUT;
+			break;
+		}
+
+		/*
+		 * Reset the poll worker func, so the waitqueues are not
+		 * added to the poll table again. We just reuse what we've
+		 * collected earlier for further iterations.
+		 */
+		init_poll_funcptr(&pwq.pt, NULL);
+	}
+
+	poll_freewait(&pwq);
+
+	if (ret == -EINTR) {
+		/*
+		 * Interrupted system call. Unref the reply object, and pass
+		 * the return value down the chain. Mark the reply as
+		 * interrupted, so the cleanup work can remove it, but do not
+		 * unlink it from the list. Once the syscall restarts, we'll
+		 * pick it up and wait on it again.
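+		 * (kdbus_conn_call() resumes such interrupted wait-contexts:
+		 * it looks them up by cookie before allocating a new one.)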
+ */ + mutex_lock(&conn_src->lock); + reply_wait->interrupted = true; + schedule_delayed_work(&conn_src->work, 0); + mutex_unlock(&conn_src->lock); + + return -ERESTARTSYS; + } + + mutex_lock(&conn_src->lock); + reply_wait->waiting = false; + entry = reply_wait->queue_entry; + if (entry) { + ret = kdbus_queue_entry_install(entry, + &cmd_send->reply.return_flags, + true); + kdbus_pool_slice_publish(entry->slice, &cmd_send->reply.offset, + &cmd_send->reply.msg_size); + kdbus_queue_entry_free(entry); + } + kdbus_reply_unlink(reply_wait); + mutex_unlock(&conn_src->lock); + + return ret; +} + +static int kdbus_pin_dst(struct kdbus_bus *bus, + struct kdbus_staging *staging, + struct kdbus_name_entry **out_name, + struct kdbus_conn **out_dst) +{ + const struct kdbus_msg *msg = staging->msg; + struct kdbus_name_owner *owner = NULL; + struct kdbus_name_entry *name = NULL; + struct kdbus_conn *dst = NULL; + int ret; + + lockdep_assert_held(&bus->name_registry->rwlock); + + if (!staging->dst_name) { + dst = kdbus_bus_find_conn_by_id(bus, msg->dst_id); + if (!dst) + return -ENXIO; + + if (!kdbus_conn_is_ordinary(dst)) { + ret = -ENXIO; + goto error; + } + } else { + name = kdbus_name_lookup_unlocked(bus->name_registry, + staging->dst_name); + if (name) + owner = kdbus_name_get_owner(name); + if (!owner) + return -ESRCH; + + /* + * If both a name and a connection ID are given as destination + * of a message, check that the currently owning connection of + * the name matches the specified ID. + * This way, we allow userspace to send the message to a + * specific connection by ID only if the connection currently + * owns the given name. + */ + if (msg->dst_id != KDBUS_DST_ID_NAME && + msg->dst_id != owner->conn->id) + return -EREMCHG; + + if ((msg->flags & KDBUS_MSG_NO_AUTO_START) && + kdbus_conn_is_activator(owner->conn)) + return -EADDRNOTAVAIL; + + dst = kdbus_conn_ref(owner->conn); + } + + *out_name = name; + *out_dst = dst; + return 0; + +error: + kdbus_conn_unref(dst); + return ret; +} + +static int kdbus_conn_reply(struct kdbus_conn *src, + struct kdbus_staging *staging) +{ + const struct kdbus_msg *msg = staging->msg; + struct kdbus_name_entry *name = NULL; + struct kdbus_reply *reply, *wake = NULL; + struct kdbus_conn *dst = NULL; + struct kdbus_bus *bus = src->ep->bus; + int ret; + + if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) || + WARN_ON(msg->flags & KDBUS_MSG_EXPECT_REPLY) || + WARN_ON(msg->flags & KDBUS_MSG_SIGNAL)) + return -EINVAL; + + /* name-registry must be locked for lookup *and* collecting data */ + down_read(&bus->name_registry->rwlock); + + /* find and pin destination */ + + ret = kdbus_pin_dst(bus, staging, &name, &dst); + if (ret < 0) + goto exit; + + mutex_lock(&dst->lock); + reply = kdbus_reply_find(src, dst, msg->cookie_reply); + if (reply) { + if (reply->sync) + wake = kdbus_reply_ref(reply); + kdbus_reply_unlink(reply); + } + mutex_unlock(&dst->lock); + + if (!reply) { + ret = -EBADSLT; + goto exit; + } + + /* send message */ + + kdbus_bus_eavesdrop(bus, src, staging); + + if (wake) + ret = kdbus_conn_entry_sync_attach(dst, staging, wake); + else + ret = kdbus_conn_entry_insert(src, dst, staging, NULL, name); + +exit: + up_read(&bus->name_registry->rwlock); + kdbus_reply_unref(wake); + kdbus_conn_unref(dst); + return ret; +} + +static struct kdbus_reply *kdbus_conn_call(struct kdbus_conn *src, + struct kdbus_staging *staging, + ktime_t exp) +{ + const struct kdbus_msg *msg = staging->msg; + struct kdbus_name_entry *name = NULL; + struct kdbus_reply *wait = NULL; + 
struct kdbus_conn *dst = NULL; + struct kdbus_bus *bus = src->ep->bus; + int ret; + + if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) || + WARN_ON(msg->flags & KDBUS_MSG_SIGNAL) || + WARN_ON(!(msg->flags & KDBUS_MSG_EXPECT_REPLY))) + return ERR_PTR(-EINVAL); + + /* resume previous wait-context, if available */ + + mutex_lock(&src->lock); + wait = kdbus_reply_find(NULL, src, msg->cookie); + if (wait) { + if (wait->interrupted) { + kdbus_reply_ref(wait); + wait->interrupted = false; + } else { + wait = NULL; + } + } + mutex_unlock(&src->lock); + + if (wait) + return wait; + + if (ktime_compare(ktime_get(), exp) >= 0) + return ERR_PTR(-ETIMEDOUT); + + /* name-registry must be locked for lookup *and* collecting data */ + down_read(&bus->name_registry->rwlock); + + /* find and pin destination */ + + ret = kdbus_pin_dst(bus, staging, &name, &dst); + if (ret < 0) + goto exit; + + if (!kdbus_conn_policy_talk(src, current_cred(), dst)) { + ret = -EPERM; + goto exit; + } + + wait = kdbus_reply_new(dst, src, msg, name, true); + if (IS_ERR(wait)) { + ret = PTR_ERR(wait); + wait = NULL; + goto exit; + } + + /* send message */ + + kdbus_bus_eavesdrop(bus, src, staging); + + ret = kdbus_conn_entry_insert(src, dst, staging, wait, name); + if (ret < 0) + goto exit; + + ret = 0; + +exit: + up_read(&bus->name_registry->rwlock); + if (ret < 0) { + kdbus_reply_unref(wait); + wait = ERR_PTR(ret); + } + kdbus_conn_unref(dst); + return wait; +} + +static int kdbus_conn_unicast(struct kdbus_conn *src, + struct kdbus_staging *staging) +{ + const struct kdbus_msg *msg = staging->msg; + struct kdbus_name_entry *name = NULL; + struct kdbus_reply *wait = NULL; + struct kdbus_conn *dst = NULL; + struct kdbus_bus *bus = src->ep->bus; + bool is_signal = (msg->flags & KDBUS_MSG_SIGNAL); + int ret = 0; + + if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) || + WARN_ON(!(msg->flags & KDBUS_MSG_EXPECT_REPLY) && + msg->cookie_reply != 0)) + return -EINVAL; + + /* name-registry must be locked for lookup *and* collecting data */ + down_read(&bus->name_registry->rwlock); + + /* find and pin destination */ + + ret = kdbus_pin_dst(bus, staging, &name, &dst); + if (ret < 0) + goto exit; + + if (is_signal) { + /* like broadcasts we eavesdrop even if the msg is dropped */ + kdbus_bus_eavesdrop(bus, src, staging); + + /* drop silently if peer is not interested or not privileged */ + if (!kdbus_match_db_match_msg(dst->match_db, src, staging) || + !kdbus_conn_policy_talk(dst, NULL, src)) + goto exit; + } else if (!kdbus_conn_policy_talk(src, current_cred(), dst)) { + ret = -EPERM; + goto exit; + } else if (msg->flags & KDBUS_MSG_EXPECT_REPLY) { + wait = kdbus_reply_new(dst, src, msg, name, false); + if (IS_ERR(wait)) { + ret = PTR_ERR(wait); + wait = NULL; + goto exit; + } + } + + /* send message */ + + if (!is_signal) + kdbus_bus_eavesdrop(bus, src, staging); + + ret = kdbus_conn_entry_insert(src, dst, staging, wait, name); + if (ret < 0 && !is_signal) + goto exit; + + /* signals are treated like broadcasts, recv-errors are ignored */ + ret = 0; + +exit: + up_read(&bus->name_registry->rwlock); + kdbus_reply_unref(wait); + kdbus_conn_unref(dst); + return ret; +} + +/** + * kdbus_conn_move_messages() - move messages from one connection to another + * @conn_dst: Connection to copy to + * @conn_src: Connection to copy from + * @name_id: Filter for the sequence number of the registered + * name, 0 means no filtering. + * + * Move all messages from one connection to another. 
This is used when + * an implementer connection is taking over/giving back a well-known name + * from/to an activator connection. + */ +void kdbus_conn_move_messages(struct kdbus_conn *conn_dst, + struct kdbus_conn *conn_src, + u64 name_id) +{ + struct kdbus_queue_entry *e, *e_tmp; + struct kdbus_reply *r, *r_tmp; + struct kdbus_bus *bus; + struct kdbus_conn *c; + LIST_HEAD(msg_list); + int i, ret = 0; + + if (WARN_ON(conn_src == conn_dst)) + return; + + bus = conn_src->ep->bus; + + /* lock order: domain -> bus -> ep -> names -> conn */ + down_read(&bus->conn_rwlock); + hash_for_each(bus->conn_hash, i, c, hentry) { + if (c == conn_src || c == conn_dst) + continue; + + mutex_lock(&c->lock); + list_for_each_entry_safe(r, r_tmp, &c->reply_list, entry) { + if (r->reply_src != conn_src) + continue; + + /* filter messages for a specific name */ + if (name_id > 0 && r->name_id != name_id) + continue; + + kdbus_conn_unref(r->reply_src); + r->reply_src = kdbus_conn_ref(conn_dst); + } + mutex_unlock(&c->lock); + } + up_read(&bus->conn_rwlock); + + kdbus_conn_lock2(conn_src, conn_dst); + list_for_each_entry_safe(e, e_tmp, &conn_src->queue.msg_list, entry) { + /* filter messages for a specific name */ + if (name_id > 0 && e->dst_name_id != name_id) + continue; + + if (!(conn_dst->flags & KDBUS_HELLO_ACCEPT_FD) && + e->gaps && e->gaps->n_fds > 0) { + kdbus_conn_lost_message(conn_dst); + kdbus_queue_entry_free(e); + continue; + } + + ret = kdbus_queue_entry_move(e, conn_dst); + if (ret < 0) { + kdbus_conn_lost_message(conn_dst); + kdbus_queue_entry_free(e); + continue; + } + } + kdbus_conn_unlock2(conn_src, conn_dst); + + /* wake up poll() */ + wake_up_interruptible(&conn_dst->wait); +} + +/* query the policy-database for all names of @whom */ +static bool kdbus_conn_policy_query_all(struct kdbus_conn *conn, + const struct cred *conn_creds, + struct kdbus_policy_db *db, + struct kdbus_conn *whom, + unsigned int access) +{ + struct kdbus_name_owner *owner; + bool pass = false; + int res; + + lockdep_assert_held(&conn->ep->bus->name_registry->rwlock); + + down_read(&db->entries_rwlock); + mutex_lock(&whom->lock); + + list_for_each_entry(owner, &whom->names_list, conn_entry) { + if (owner->flags & KDBUS_NAME_IN_QUEUE) + continue; + + res = kdbus_policy_query_unlocked(db, + conn_creds ? : conn->cred, + owner->name->name, + kdbus_strhash(owner->name->name)); + if (res >= (int)access) { + pass = true; + break; + } + } + + mutex_unlock(&whom->lock); + up_read(&db->entries_rwlock); + + return pass; +} + +/** + * kdbus_conn_policy_own_name() - verify a connection can own the given name + * @conn: Connection + * @conn_creds: Credentials of @conn to use for policy check + * @name: Name + * + * This verifies that @conn is allowed to acquire the well-known name @name. + * + * Return: true if allowed, false if not. 
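+ *
+ * For example, on the bus' default endpoint this means that, unless @conn
+ * belongs to the same user as the bus owner, a matching KDBUS_POLICY_OWN
+ * entry for @name must exist in the bus policy-db.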
+ */ +bool kdbus_conn_policy_own_name(struct kdbus_conn *conn, + const struct cred *conn_creds, + const char *name) +{ + unsigned int hash = kdbus_strhash(name); + int res; + + if (!conn_creds) + conn_creds = conn->cred; + + if (conn->ep->user) { + res = kdbus_policy_query(&conn->ep->policy_db, conn_creds, + name, hash); + if (res < KDBUS_POLICY_OWN) + return false; + } + + if (conn->owner) + return true; + + res = kdbus_policy_query(&conn->ep->bus->policy_db, conn_creds, + name, hash); + return res >= KDBUS_POLICY_OWN; +} + +/** + * kdbus_conn_policy_talk() - verify a connection can talk to a given peer + * @conn: Connection that tries to talk + * @conn_creds: Credentials of @conn to use for policy check + * @to: Connection that is talked to + * + * This verifies that @conn is allowed to talk to @to. + * + * Return: true if allowed, false if not. + */ +bool kdbus_conn_policy_talk(struct kdbus_conn *conn, + const struct cred *conn_creds, + struct kdbus_conn *to) +{ + if (!conn_creds) + conn_creds = conn->cred; + + if (conn->ep->user && + !kdbus_conn_policy_query_all(conn, conn_creds, &conn->ep->policy_db, + to, KDBUS_POLICY_TALK)) + return false; + + if (conn->owner) + return true; + if (uid_eq(conn_creds->euid, to->cred->uid)) + return true; + + return kdbus_conn_policy_query_all(conn, conn_creds, + &conn->ep->bus->policy_db, to, + KDBUS_POLICY_TALK); +} + +/** + * kdbus_conn_policy_see_name_unlocked() - verify a connection can see a given + * name + * @conn: Connection + * @conn_creds: Credentials of @conn to use for policy check + * @name: Name + * + * This verifies that @conn is allowed to see the well-known name @name. Caller + * must hold policy-lock. + * + * Return: true if allowed, false if not. + */ +bool kdbus_conn_policy_see_name_unlocked(struct kdbus_conn *conn, + const struct cred *conn_creds, + const char *name) +{ + int res; + + /* + * By default, all names are visible on a bus. SEE policies can only be + * installed on custom endpoints, where by default no name is visible. + */ + if (!conn->ep->user) + return true; + + res = kdbus_policy_query_unlocked(&conn->ep->policy_db, + conn_creds ? : conn->cred, + name, kdbus_strhash(name)); + return res >= KDBUS_POLICY_SEE; +} + +static bool kdbus_conn_policy_see_name(struct kdbus_conn *conn, + const struct cred *conn_creds, + const char *name) +{ + bool res; + + down_read(&conn->ep->policy_db.entries_rwlock); + res = kdbus_conn_policy_see_name_unlocked(conn, conn_creds, name); + up_read(&conn->ep->policy_db.entries_rwlock); + + return res; +} + +static bool kdbus_conn_policy_see(struct kdbus_conn *conn, + const struct cred *conn_creds, + struct kdbus_conn *whom) +{ + /* + * By default, all names are visible on a bus, so a connection can + * always see other connections. SEE policies can only be installed on + * custom endpoints, where by default no name is visible and we hide + * peers from each other, unless you see at least _one_ name of the + * peer. + */ + return !conn->ep->user || + kdbus_conn_policy_query_all(conn, conn_creds, + &conn->ep->policy_db, whom, + KDBUS_POLICY_SEE); +} + +/** + * kdbus_conn_policy_see_notification() - verify a connection is allowed to + * receive a given kernel notification + * @conn: Connection + * @conn_creds: Credentials of @conn to use for policy check + * @msg: Notification message + * + * This checks whether @conn is allowed to see the kernel notification. + * + * Return: true if allowed, false if not. 
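+ *
+ * For example, a name-change notification is only delivered to peers that
+ * are allowed to see the name it refers to, while ID notifications are
+ * visible to everyone.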
+ */ +bool kdbus_conn_policy_see_notification(struct kdbus_conn *conn, + const struct cred *conn_creds, + const struct kdbus_msg *msg) +{ + /* + * Depending on the notification type, broadcasted kernel notifications + * have to be filtered: + * + * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE}: This notification is forwarded + * to a peer if, and only if, that peer can see the name this + * notification is for. + * + * KDBUS_ITEM_ID_{ADD,REMOVE}: Notifications for ID changes are + * broadcast to everyone, to allow tracking peers. + */ + + switch (msg->items[0].type) { + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_REMOVE: + case KDBUS_ITEM_NAME_CHANGE: + return kdbus_conn_policy_see_name(conn, conn_creds, + msg->items[0].name_change.name); + + case KDBUS_ITEM_ID_ADD: + case KDBUS_ITEM_ID_REMOVE: + return true; + + default: + WARN(1, "Invalid type for notification broadcast: %llu\n", + (unsigned long long)msg->items[0].type); + return false; + } +} + +/** + * kdbus_cmd_hello() - handle KDBUS_CMD_HELLO + * @ep: Endpoint to operate on + * @file: File this connection is opened on + * @argp: Command payload + * + * Return: NULL or newly created connection on success, ERR_PTR on failure. + */ +struct kdbus_conn *kdbus_cmd_hello(struct kdbus_ep *ep, struct file *file, + void __user *argp) +{ + struct kdbus_cmd_hello *cmd; + struct kdbus_conn *c = NULL; + const char *item_name; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_NAME }, + { .type = KDBUS_ITEM_CREDS }, + { .type = KDBUS_ITEM_PIDS }, + { .type = KDBUS_ITEM_SECLABEL }, + { .type = KDBUS_ITEM_CONN_DESCRIPTION }, + { .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_HELLO_ACCEPT_FD | + KDBUS_HELLO_ACTIVATOR | + KDBUS_HELLO_POLICY_HOLDER | + KDBUS_HELLO_MONITOR, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + + item_name = argv[1].item ? argv[1].item->str : NULL; + + c = kdbus_conn_new(ep, file, cmd, item_name, + argv[2].item ? &argv[2].item->creds : NULL, + argv[3].item ? &argv[3].item->pids : NULL, + argv[4].item ? argv[4].item->str : NULL, + argv[5].item ? argv[5].item->str : NULL); + if (IS_ERR(c)) { + ret = PTR_ERR(c); + c = NULL; + goto exit; + } + + ret = kdbus_conn_connect(c, item_name); + if (ret < 0) + goto exit; + + if (kdbus_conn_is_activator(c) || kdbus_conn_is_policy_holder(c)) { + ret = kdbus_conn_acquire(c); + if (ret < 0) + goto exit; + + ret = kdbus_policy_set(&c->ep->bus->policy_db, args.items, + args.items_size, 1, + kdbus_conn_is_policy_holder(c), c); + kdbus_conn_release(c); + if (ret < 0) + goto exit; + } + + if (copy_to_user(argp, cmd, sizeof(*cmd))) + ret = -EFAULT; + +exit: + ret = kdbus_args_clear(&args, ret); + if (ret < 0) { + if (c) { + kdbus_conn_disconnect(c, false); + kdbus_conn_unref(c); + } + return ERR_PTR(ret); + } + return c; +} + +/** + * kdbus_cmd_byebye_unlocked() - handle KDBUS_CMD_BYEBYE + * @conn: connection to operate on + * @argp: command payload + * + * The caller must not hold any active reference to @conn or this will deadlock. + * + * Return: >=0 on success, negative error code on failure. 
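+ *
+ * From user-space this corresponds to (illustrative sketch; @conn_fd is
+ * assumed to be a connected endpoint file descriptor):
+ *
+ *	struct kdbus_cmd cmd = { .size = sizeof(cmd) };
+ *
+ *	ret = ioctl(conn_fd, KDBUS_CMD_BYEBYE, &cmd);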
+ */ +int kdbus_cmd_byebye_unlocked(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_cmd *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + ret = kdbus_conn_disconnect(conn, true); + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_conn_info() - handle KDBUS_CMD_CONN_INFO + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_conn_info(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_meta_conn *conn_meta = NULL; + struct kdbus_pool_slice *slice = NULL; + struct kdbus_name_entry *entry = NULL; + struct kdbus_name_owner *owner = NULL; + struct kdbus_conn *owner_conn = NULL; + struct kdbus_item *meta_items = NULL; + struct kdbus_info info = {}; + struct kdbus_cmd_info *cmd; + struct kdbus_bus *bus = conn->ep->bus; + struct kvec kvec[3]; + size_t meta_size, cnt = 0; + const char *name; + u64 attach_flags, size = 0; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_NAME }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + /* registry must be held throughout lookup *and* collecting data */ + down_read(&bus->name_registry->rwlock); + + ret = kdbus_sanitize_attach_flags(cmd->attach_flags, &attach_flags); + if (ret < 0) + goto exit; + + name = argv[1].item ? 
argv[1].item->str : NULL; + + if (name) { + entry = kdbus_name_lookup_unlocked(bus->name_registry, name); + if (entry) + owner = kdbus_name_get_owner(entry); + if (!owner || + !kdbus_conn_policy_see_name(conn, current_cred(), name) || + (cmd->id != 0 && owner->conn->id != cmd->id)) { + /* pretend a name doesn't exist if you cannot see it */ + ret = -ESRCH; + goto exit; + } + + owner_conn = kdbus_conn_ref(owner->conn); + } else if (cmd->id > 0) { + owner_conn = kdbus_bus_find_conn_by_id(bus, cmd->id); + if (!owner_conn || !kdbus_conn_policy_see(conn, current_cred(), + owner_conn)) { + /* pretend an id doesn't exist if you cannot see it */ + ret = -ENXIO; + goto exit; + } + } else { + ret = -EINVAL; + goto exit; + } + + attach_flags &= atomic64_read(&owner_conn->attach_flags_send); + + conn_meta = kdbus_meta_conn_new(); + if (IS_ERR(conn_meta)) { + ret = PTR_ERR(conn_meta); + conn_meta = NULL; + goto exit; + } + + ret = kdbus_meta_conn_collect(conn_meta, owner_conn, 0, attach_flags); + if (ret < 0) + goto exit; + + ret = kdbus_meta_emit(owner_conn->meta_proc, owner_conn->meta_fake, + conn_meta, conn, attach_flags, + &meta_items, &meta_size); + if (ret < 0) + goto exit; + + info.id = owner_conn->id; + info.flags = owner_conn->flags; + + kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &size); + if (meta_size > 0) { + kdbus_kvec_set(&kvec[cnt++], meta_items, meta_size, &size); + cnt += !!kdbus_kvec_pad(&kvec[cnt], &size); + } + + info.size = size; + + slice = kdbus_pool_slice_alloc(conn->pool, size, false); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto exit; + } + + ret = kdbus_pool_slice_copy_kvec(slice, 0, kvec, cnt, size); + if (ret < 0) + goto exit; + + kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->info_size); + + if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) || + kdbus_member_set_user(&cmd->info_size, argp, + typeof(*cmd), info_size)) { + ret = -EFAULT; + goto exit; + } + + ret = 0; + +exit: + up_read(&bus->name_registry->rwlock); + kdbus_pool_slice_release(slice); + kfree(meta_items); + kdbus_meta_conn_unref(conn_meta); + kdbus_conn_unref(owner_conn); + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_update() - handle KDBUS_CMD_UPDATE + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_update(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_item *item_policy; + u64 *item_attach_send = NULL; + u64 *item_attach_recv = NULL; + struct kdbus_cmd *cmd; + u64 attach_send; + u64 attach_recv; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_ATTACH_FLAGS_SEND }, + { .type = KDBUS_ITEM_ATTACH_FLAGS_RECV }, + { .type = KDBUS_ITEM_NAME, .multiple = true }, + { .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + item_attach_send = argv[1].item ? &argv[1].item->data64[0] : NULL; + item_attach_recv = argv[2].item ? &argv[2].item->data64[0] : NULL; + item_policy = argv[3].item ? 
: argv[4].item; + + if (item_attach_send) { + if (!kdbus_conn_is_ordinary(conn) && + !kdbus_conn_is_monitor(conn)) { + ret = -EOPNOTSUPP; + goto exit; + } + + ret = kdbus_sanitize_attach_flags(*item_attach_send, + &attach_send); + if (ret < 0) + goto exit; + } + + if (item_attach_recv) { + if (!kdbus_conn_is_ordinary(conn) && + !kdbus_conn_is_monitor(conn) && + !kdbus_conn_is_activator(conn)) { + ret = -EOPNOTSUPP; + goto exit; + } + + ret = kdbus_sanitize_attach_flags(*item_attach_recv, + &attach_recv); + if (ret < 0) + goto exit; + } + + if (item_policy && !kdbus_conn_is_policy_holder(conn)) { + ret = -EOPNOTSUPP; + goto exit; + } + + /* now that we verified the input, update the connection */ + + if (item_policy) { + ret = kdbus_policy_set(&conn->ep->bus->policy_db, cmd->items, + KDBUS_ITEMS_SIZE(cmd, items), + 1, true, conn); + if (ret < 0) + goto exit; + } + + if (item_attach_send) + atomic64_set(&conn->attach_flags_send, attach_send); + + if (item_attach_recv) + atomic64_set(&conn->attach_flags_recv, attach_recv); + +exit: + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_send() - handle KDBUS_CMD_SEND + * @conn: connection to operate on + * @f: file this command was called on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_send(struct kdbus_conn *conn, struct file *f, void __user *argp) +{ + struct kdbus_cmd_send *cmd; + struct kdbus_staging *staging = NULL; + struct kdbus_msg *msg = NULL; + struct file *cancel_fd = NULL; + int ret, ret2; + + /* command arguments */ + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_CANCEL_FD }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_SEND_SYNC_REPLY, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + /* message arguments */ + struct kdbus_arg msg_argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_PAYLOAD_VEC, .multiple = true }, + { .type = KDBUS_ITEM_PAYLOAD_MEMFD, .multiple = true }, + { .type = KDBUS_ITEM_FDS }, + { .type = KDBUS_ITEM_BLOOM_FILTER }, + { .type = KDBUS_ITEM_DST_NAME }, + }; + struct kdbus_args msg_args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_MSG_EXPECT_REPLY | + KDBUS_MSG_NO_AUTO_START | + KDBUS_MSG_SIGNAL, + .argv = msg_argv, + .argc = ARRAY_SIZE(msg_argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + /* make sure to parse both, @cmd and @msg on negotiation */ + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret < 0) + goto exit; + else if (ret > 0 && !cmd->msg_address) /* negotiation without msg */ + goto exit; + + ret2 = kdbus_args_parse_msg(&msg_args, KDBUS_PTR(cmd->msg_address), + &msg); + if (ret2 < 0) { /* cannot parse message */ + ret = ret2; + goto exit; + } else if (ret2 > 0 && !ret) { /* msg-negot implies cmd-negot */ + ret = -EINVAL; + goto exit; + } else if (ret > 0) { /* negotiation */ + goto exit; + } + + /* here we parsed both, @cmd and @msg, and neither wants negotiation */ + + cmd->reply.return_flags = 0; + kdbus_pool_publish_empty(conn->pool, &cmd->reply.offset, + &cmd->reply.msg_size); + + if (argv[1].item) { + cancel_fd = fget(argv[1].item->fds[0]); + if (!cancel_fd) { + ret = -EBADF; + goto exit; + } + + if (!cancel_fd->f_op->poll) { + ret = -EINVAL; + goto exit; + } + } + + /* patch-in the source of this message */ + if (msg->src_id > 0 && msg->src_id != conn->id) { + ret = -EINVAL; + goto exit; + } + msg->src_id = conn->id; + + staging = kdbus_staging_new_user(conn->ep->bus, cmd, msg); + 
if (IS_ERR(staging)) {
+		ret = PTR_ERR(staging);
+		staging = NULL;
+		goto exit;
+	}
+
+	if (msg->dst_id == KDBUS_DST_ID_BROADCAST) {
+		down_read(&conn->ep->bus->name_registry->rwlock);
+		kdbus_bus_broadcast(conn->ep->bus, conn, staging);
+		up_read(&conn->ep->bus->name_registry->rwlock);
+	} else if (cmd->flags & KDBUS_SEND_SYNC_REPLY) {
+		struct kdbus_reply *r;
+		ktime_t exp;
+
+		exp = ns_to_ktime(msg->timeout_ns);
+		r = kdbus_conn_call(conn, staging, exp);
+		if (IS_ERR(r)) {
+			ret = PTR_ERR(r);
+			goto exit;
+		}
+
+		ret = kdbus_conn_wait_reply(conn, cmd, f, cancel_fd, r, exp);
+		kdbus_reply_unref(r);
+		if (ret < 0)
+			goto exit;
+	} else if ((msg->flags & KDBUS_MSG_EXPECT_REPLY) ||
+		   msg->cookie_reply == 0) {
+		ret = kdbus_conn_unicast(conn, staging);
+		if (ret < 0)
+			goto exit;
+	} else {
+		ret = kdbus_conn_reply(conn, staging);
+		if (ret < 0)
+			goto exit;
+	}
+
+	if (kdbus_member_set_user(&cmd->reply, argp, typeof(*cmd), reply))
+		ret = -EFAULT;
+
+exit:
+	if (cancel_fd)
+		fput(cancel_fd);
+	kdbus_staging_free(staging);
+	ret = kdbus_args_clear(&msg_args, ret);
+	return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_recv() - handle KDBUS_CMD_RECV
+ * @conn:		connection to operate on
+ * @argp:		command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_recv(struct kdbus_conn *conn, void __user *argp)
+{
+	struct kdbus_queue_entry *entry;
+	struct kdbus_cmd_recv *cmd;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_RECV_PEEK |
+				 KDBUS_RECV_DROP |
+				 KDBUS_RECV_USE_PRIORITY,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	if (!kdbus_conn_is_ordinary(conn) &&
+	    !kdbus_conn_is_monitor(conn) &&
+	    !kdbus_conn_is_activator(conn))
+		return -EOPNOTSUPP;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	cmd->dropped_msgs = 0;
+	cmd->msg.return_flags = 0;
+	kdbus_pool_publish_empty(conn->pool, &cmd->msg.offset,
+				 &cmd->msg.msg_size);
+
+	/* DROP+priority is not reliable, so prevent it */
+	if ((cmd->flags & KDBUS_RECV_DROP) &&
+	    (cmd->flags & KDBUS_RECV_USE_PRIORITY)) {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	mutex_lock(&conn->lock);
+
+	entry = kdbus_queue_peek(&conn->queue, cmd->priority,
+				 cmd->flags & KDBUS_RECV_USE_PRIORITY);
+	if (!entry) {
+		mutex_unlock(&conn->lock);
+		ret = -EAGAIN;
+	} else if (cmd->flags & KDBUS_RECV_DROP) {
+		struct kdbus_reply *reply = kdbus_reply_ref(entry->reply);
+
+		kdbus_queue_entry_free(entry);
+
+		mutex_unlock(&conn->lock);
+
+		if (reply) {
+			mutex_lock(&reply->reply_dst->lock);
+			if (!list_empty(&reply->entry)) {
+				kdbus_reply_unlink(reply);
+				if (reply->sync)
+					kdbus_sync_reply_wakeup(reply, -EPIPE);
+				else
+					kdbus_notify_reply_dead(conn->ep->bus,
+							reply->reply_dst->id,
+							reply->cookie);
+			}
+			mutex_unlock(&reply->reply_dst->lock);
+			kdbus_notify_flush(conn->ep->bus);
+		}
+
+		kdbus_reply_unref(reply);
+	} else {
+		bool install_fds;
+
+		/*
+		 * PEEK just returns the location of the next message. Do not
+		 * install FDs nor memfds nor anything else. The only
+		 * information of interest should be the message header and
+		 * metadata. Any FD numbers in the payload are undefined for
+		 * PEEK'ed messages.
+		 * Also make sure to never install fds into a connection that
+		 * has refused to receive any. Ordinary connections will not
+		 * get messages with FDs queued (the sender will get -ECOMM),
+		 * but eavesdroppers might.
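+		 * (Eavesdroppers that did not set KDBUS_HELLO_ACCEPT_FD
+		 * receive such messages with all FD numbers set to -1, see
+		 * kdbus_conn_entry_make().)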
+ */ + install_fds = (conn->flags & KDBUS_HELLO_ACCEPT_FD) && + !(cmd->flags & KDBUS_RECV_PEEK); + + ret = kdbus_queue_entry_install(entry, + &cmd->msg.return_flags, + install_fds); + if (ret < 0) { + mutex_unlock(&conn->lock); + goto exit; + } + + kdbus_pool_slice_publish(entry->slice, &cmd->msg.offset, + &cmd->msg.msg_size); + + if (!(cmd->flags & KDBUS_RECV_PEEK)) + kdbus_queue_entry_free(entry); + + mutex_unlock(&conn->lock); + } + + cmd->dropped_msgs = atomic_xchg(&conn->lost_count, 0); + if (cmd->dropped_msgs > 0) + cmd->return_flags |= KDBUS_RECV_RETURN_DROPPED_MSGS; + + if (kdbus_member_set_user(&cmd->msg, argp, typeof(*cmd), msg) || + kdbus_member_set_user(&cmd->dropped_msgs, argp, typeof(*cmd), + dropped_msgs)) + ret = -EFAULT; + +exit: + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_free() - handle KDBUS_CMD_FREE + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_free(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_cmd_free *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn) && + !kdbus_conn_is_monitor(conn) && + !kdbus_conn_is_activator(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + ret = kdbus_pool_release_offset(conn->pool, cmd->offset); + + return kdbus_args_clear(&args, ret); +} diff -Nur linux-4.2/ipc/kdbus/connection.h linux-4.2-pck/ipc/kdbus/connection.h --- linux-4.2/ipc/kdbus/connection.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/connection.h 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,260 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_CONNECTION_H +#define __KDBUS_CONNECTION_H + +#include +#include +#include +#include + +#include "limits.h" +#include "metadata.h" +#include "pool.h" +#include "queue.h" +#include "util.h" + +#define KDBUS_HELLO_SPECIAL_CONN (KDBUS_HELLO_ACTIVATOR | \ + KDBUS_HELLO_POLICY_HOLDER | \ + KDBUS_HELLO_MONITOR) + +struct kdbus_name_entry; +struct kdbus_quota; +struct kdbus_staging; + +/** + * struct kdbus_conn - connection to a bus + * @kref: Reference count + * @active: Active references to the connection + * @id: Connection ID + * @flags: KDBUS_HELLO_* flags + * @attach_flags_send: KDBUS_ATTACH_* flags for sending + * @attach_flags_recv: KDBUS_ATTACH_* flags for receiving + * @description: Human-readable connection description, used for + * debugging. This field is only set when the + * connection is created. 
+ * @ep:			The endpoint this connection belongs to
+ * @lock:		Connection data lock
+ * @hentry:		Entry in ID <-> connection map
+ * @ep_entry:		Entry in endpoint
+ * @monitor_entry:	Entry in monitor, if the connection is a monitor
+ * @reply_list:		List of connections this connection should
+ *			reply to
+ * @work:		Delayed work to handle timeouts
+ * @match_db:		Subscription filter for broadcast messages
+ * @meta_proc:		Process metadata of connection creator, or NULL
+ * @meta_fake:		Faked metadata, or NULL
+ * @pool:		The user's buffer to receive messages
+ * @user:		Owner of the connection
+ * @cred:		The credentials of the connection at creation time
+ * @pid:		Pid at creation time
+ * @root_path:		Root path at creation time
+ * @request_count:	Number of pending requests issued by this
+ *			connection that are waiting for replies from
+ *			other peers
+ * @lost_count:		Number of lost broadcast messages
+ * @wait:		Wait queue used to wake up this connection
+ * @queue:		The message queue associated with this connection
+ * @quota:		Array of per-user quota indexed by user->id
+ * @n_quota:		Number of elements in quota array
+ * @names_list:		List of well-known names
+ * @name_count:		Number of owned well-known names
+ * @privileged:		Whether this connection is privileged on the domain
+ * @owner:		Owned by the same user as the bus owner
+ */
+struct kdbus_conn {
+	struct kref kref;
+	atomic_t active;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
+	u64 id;
+	u64 flags;
+	atomic64_t attach_flags_send;
+	atomic64_t attach_flags_recv;
+	const char *description;
+	struct kdbus_ep *ep;
+	struct mutex lock;
+	struct hlist_node hentry;
+	struct list_head ep_entry;
+	struct list_head monitor_entry;
+	struct list_head reply_list;
+	struct delayed_work work;
+	struct kdbus_match_db *match_db;
+	struct kdbus_meta_proc *meta_proc;
+	struct kdbus_meta_fake *meta_fake;
+	struct kdbus_pool *pool;
+	struct kdbus_user *user;
+	const struct cred *cred;
+	struct pid *pid;
+	struct path root_path;
+	atomic_t request_count;
+	atomic_t lost_count;
+	wait_queue_head_t wait;
+	struct kdbus_queue queue;
+
+	struct kdbus_quota *quota;
+	unsigned int n_quota;
+
+	/* protected by registry->rwlock */
+	struct list_head names_list;
+	unsigned int name_count;
+
+	bool privileged:1;
+	bool owner:1;
+};
+
+struct kdbus_conn *kdbus_conn_ref(struct kdbus_conn *conn);
+struct kdbus_conn *kdbus_conn_unref(struct kdbus_conn *conn);
+bool kdbus_conn_active(const struct kdbus_conn *conn);
+int kdbus_conn_acquire(struct kdbus_conn *conn);
+void kdbus_conn_release(struct kdbus_conn *conn);
+int kdbus_conn_disconnect(struct kdbus_conn *conn, bool ensure_queue_empty);
+bool kdbus_conn_has_name(struct kdbus_conn *conn, const char *name);
+int kdbus_conn_quota_inc(struct kdbus_conn *c, struct kdbus_user *u,
+			 size_t memory, size_t fds);
+void kdbus_conn_quota_dec(struct kdbus_conn *c, struct kdbus_user *u,
+			  size_t memory, size_t fds);
+void kdbus_conn_lost_message(struct kdbus_conn *c);
+int kdbus_conn_entry_insert(struct kdbus_conn *conn_src,
+			    struct kdbus_conn *conn_dst,
+			    struct kdbus_staging *staging,
+			    struct kdbus_reply *reply,
+			    const struct kdbus_name_entry *name);
+void kdbus_conn_move_messages(struct kdbus_conn *conn_dst,
+			      struct kdbus_conn *conn_src,
+			      u64 name_id);
+
+/* policy */
+bool kdbus_conn_policy_own_name(struct kdbus_conn *conn,
+				const struct cred *conn_creds,
+				const char *name);
+bool kdbus_conn_policy_talk(struct kdbus_conn *conn,
+			    const struct cred *conn_creds,
+			    struct kdbus_conn *to);
+bool
kdbus_conn_policy_see_name_unlocked(struct kdbus_conn *conn, + const struct cred *curr_creds, + const char *name); +bool kdbus_conn_policy_see_notification(struct kdbus_conn *conn, + const struct cred *curr_creds, + const struct kdbus_msg *msg); + +/* command dispatcher */ +struct kdbus_conn *kdbus_cmd_hello(struct kdbus_ep *ep, struct file *file, + void __user *argp); +int kdbus_cmd_byebye_unlocked(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_conn_info(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_update(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_send(struct kdbus_conn *conn, struct file *f, void __user *argp); +int kdbus_cmd_recv(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_free(struct kdbus_conn *conn, void __user *argp); + +/** + * kdbus_conn_is_ordinary() - Check if connection is ordinary + * @conn: The connection to check + * + * Return: Non-zero if the connection is an ordinary connection + */ +static inline int kdbus_conn_is_ordinary(const struct kdbus_conn *conn) +{ + return !(conn->flags & KDBUS_HELLO_SPECIAL_CONN); +} + +/** + * kdbus_conn_is_activator() - Check if connection is an activator + * @conn: The connection to check + * + * Return: Non-zero if the connection is an activator + */ +static inline int kdbus_conn_is_activator(const struct kdbus_conn *conn) +{ + return conn->flags & KDBUS_HELLO_ACTIVATOR; +} + +/** + * kdbus_conn_is_policy_holder() - Check if connection is a policy holder + * @conn: The connection to check + * + * Return: Non-zero if the connection is a policy holder + */ +static inline int kdbus_conn_is_policy_holder(const struct kdbus_conn *conn) +{ + return conn->flags & KDBUS_HELLO_POLICY_HOLDER; +} + +/** + * kdbus_conn_is_monitor() - Check if connection is a monitor + * @conn: The connection to check + * + * Return: Non-zero if the connection is a monitor + */ +static inline int kdbus_conn_is_monitor(const struct kdbus_conn *conn) +{ + return conn->flags & KDBUS_HELLO_MONITOR; +} + +/** + * kdbus_conn_lock2() - Lock two connections + * @a: connection A to lock or NULL + * @b: connection B to lock or NULL + * + * Lock two connections at once. As we need to have a stable locking order, we + * always lock the connection with lower memory address first. + */ +static inline void kdbus_conn_lock2(struct kdbus_conn *a, struct kdbus_conn *b) +{ + if (a < b) { + if (a) + mutex_lock(&a->lock); + if (b && b != a) + mutex_lock_nested(&b->lock, !!a); + } else { + if (b) + mutex_lock(&b->lock); + if (a && a != b) + mutex_lock_nested(&a->lock, !!b); + } +} + +/** + * kdbus_conn_unlock2() - Unlock two connections + * @a: connection A to unlock or NULL + * @b: connection B to unlock or NULL + * + * Unlock two connections at once. See kdbus_conn_lock2(). + */ +static inline void kdbus_conn_unlock2(struct kdbus_conn *a, + struct kdbus_conn *b) +{ + if (a) + mutex_unlock(&a->lock); + if (b && b != a) + mutex_unlock(&b->lock); +} + +/** + * kdbus_conn_assert_active() - lockdep assert on active lock + * @conn: connection that shall be active + * + * This verifies via lockdep that the caller holds an active reference to the + * given connection. 
+ */ +static inline void kdbus_conn_assert_active(struct kdbus_conn *conn) +{ + lockdep_assert_held(conn); +} + +#endif diff -Nur linux-4.2/ipc/kdbus/domain.c linux-4.2-pck/ipc/kdbus/domain.c --- linux-4.2/ipc/kdbus/domain.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/domain.c 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,296 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "domain.h" +#include "handle.h" +#include "item.h" +#include "limits.h" +#include "util.h" + +static void kdbus_domain_control_free(struct kdbus_node *node) +{ + kfree(node); +} + +static struct kdbus_node *kdbus_domain_control_new(struct kdbus_domain *domain, + unsigned int access) +{ + struct kdbus_node *node; + int ret; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return ERR_PTR(-ENOMEM); + + kdbus_node_init(node, KDBUS_NODE_CONTROL); + + node->free_cb = kdbus_domain_control_free; + node->mode = domain->node.mode; + node->mode = S_IRUSR | S_IWUSR; + if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD)) + node->mode |= S_IRGRP | S_IWGRP; + if (access & KDBUS_MAKE_ACCESS_WORLD) + node->mode |= S_IROTH | S_IWOTH; + + ret = kdbus_node_link(node, &domain->node, "control"); + if (ret < 0) + goto exit_free; + + return node; + +exit_free: + kdbus_node_drain(node); + kdbus_node_unref(node); + return ERR_PTR(ret); +} + +static void kdbus_domain_free(struct kdbus_node *node) +{ + struct kdbus_domain *domain = + container_of(node, struct kdbus_domain, node); + + put_user_ns(domain->user_namespace); + ida_destroy(&domain->user_ida); + idr_destroy(&domain->user_idr); + kfree(domain); +} + +/** + * kdbus_domain_new() - create a new domain + * @access: The access mode for this node (KDBUS_MAKE_ACCESS_*) + * + * Return: a new kdbus_domain on success, ERR_PTR on failure + */ +struct kdbus_domain *kdbus_domain_new(unsigned int access) +{ + struct kdbus_domain *d; + int ret; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return ERR_PTR(-ENOMEM); + + kdbus_node_init(&d->node, KDBUS_NODE_DOMAIN); + + d->node.free_cb = kdbus_domain_free; + d->node.mode = S_IRUSR | S_IXUSR; + if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD)) + d->node.mode |= S_IRGRP | S_IXGRP; + if (access & KDBUS_MAKE_ACCESS_WORLD) + d->node.mode |= S_IROTH | S_IXOTH; + + mutex_init(&d->lock); + idr_init(&d->user_idr); + ida_init(&d->user_ida); + + /* Pin user namespace so we can guarantee domain-unique bus * names. 
*/ + d->user_namespace = get_user_ns(current_user_ns()); + + ret = kdbus_node_link(&d->node, NULL, NULL); + if (ret < 0) + goto exit_unref; + + return d; + +exit_unref: + kdbus_node_drain(&d->node); + kdbus_node_unref(&d->node); + return ERR_PTR(ret); +} + +/** + * kdbus_domain_ref() - take a domain reference + * @domain: Domain + * + * Return: the domain itself + */ +struct kdbus_domain *kdbus_domain_ref(struct kdbus_domain *domain) +{ + if (domain) + kdbus_node_ref(&domain->node); + return domain; +} + +/** + * kdbus_domain_unref() - drop a domain reference + * @domain: Domain + * + * When the last reference is dropped, the domain internal structure + * is freed. + * + * Return: NULL + */ +struct kdbus_domain *kdbus_domain_unref(struct kdbus_domain *domain) +{ + if (domain) + kdbus_node_unref(&domain->node); + return NULL; +} + +/** + * kdbus_domain_populate() - populate static domain nodes + * @domain: domain to populate + * @access: KDBUS_MAKE_ACCESS_* access restrictions for new nodes + * + * Allocate and activate static sub-nodes of the given domain. This will fail if + * you call it on a non-active node or if the domain was already populated. + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_domain_populate(struct kdbus_domain *domain, unsigned int access) +{ + struct kdbus_node *control; + + /* + * Create a control-node for this domain. We drop our own reference + * immediately, effectively causing the node to be deactivated and + * released when the parent domain is. + */ + control = kdbus_domain_control_new(domain, access); + if (IS_ERR(control)) + return PTR_ERR(control); + + kdbus_node_activate(control); + kdbus_node_unref(control); + return 0; +} + +/** + * kdbus_user_lookup() - lookup a kdbus_user object + * @domain: domain of the user + * @uid: uid of the user; INVALID_UID for an anon user + * + * Lookup the kdbus user accounting object for the given domain. If INVALID_UID + * is passed, a new anonymous user is created which is private to the caller. + * + * Return: The user object is returned, ERR_PTR on failure. + */ +struct kdbus_user *kdbus_user_lookup(struct kdbus_domain *domain, kuid_t uid) +{ + struct kdbus_user *u = NULL, *old = NULL; + int ret; + + mutex_lock(&domain->lock); + + if (uid_valid(uid)) { + old = idr_find(&domain->user_idr, __kuid_val(uid)); + /* + * If the object is about to be destroyed, ignore it and + * replace the slot in the IDR later on. + */ + if (old && kref_get_unless_zero(&old->kref)) { + mutex_unlock(&domain->lock); + return old; + } + } + + u = kzalloc(sizeof(*u), GFP_KERNEL); + if (!u) { + ret = -ENOMEM; + goto exit; + } + + kref_init(&u->kref); + u->domain = kdbus_domain_ref(domain); + u->uid = uid; + atomic_set(&u->buses, 0); + atomic_set(&u->connections, 0); + + if (uid_valid(uid)) { + if (old) { + idr_replace(&domain->user_idr, u, __kuid_val(uid)); + old->uid = INVALID_UID; /* mark old as removed */ + } else { + ret = idr_alloc(&domain->user_idr, u, __kuid_val(uid), + __kuid_val(uid) + 1, GFP_KERNEL); + if (ret < 0) + goto exit; + } + } + + /* + * Allocate the smallest possible index for this user; used + * in arrays for accounting user quota in receiver queues. 
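+	 * ID 0 (KDBUS_USER_KERNEL_ID) is reserved for kernel accounting,
+	 * so allocation deliberately starts at 1; the upper bound of 0
+	 * tells ida_simple_get() to impose no maximum.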
+ */ + ret = ida_simple_get(&domain->user_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto exit; + + u->id = ret; + mutex_unlock(&domain->lock); + return u; + +exit: + if (u) { + if (uid_valid(u->uid)) + idr_remove(&domain->user_idr, __kuid_val(u->uid)); + kdbus_domain_unref(u->domain); + kfree(u); + } + mutex_unlock(&domain->lock); + return ERR_PTR(ret); +} + +static void __kdbus_user_free(struct kref *kref) +{ + struct kdbus_user *user = container_of(kref, struct kdbus_user, kref); + + WARN_ON(atomic_read(&user->buses) > 0); + WARN_ON(atomic_read(&user->connections) > 0); + + mutex_lock(&user->domain->lock); + ida_simple_remove(&user->domain->user_ida, user->id); + if (uid_valid(user->uid)) + idr_remove(&user->domain->user_idr, __kuid_val(user->uid)); + mutex_unlock(&user->domain->lock); + + kdbus_domain_unref(user->domain); + kfree(user); +} + +/** + * kdbus_user_ref() - take a user reference + * @u: User + * + * Return: @u is returned + */ +struct kdbus_user *kdbus_user_ref(struct kdbus_user *u) +{ + if (u) + kref_get(&u->kref); + return u; +} + +/** + * kdbus_user_unref() - drop a user reference + * @u: User + * + * Return: NULL + */ +struct kdbus_user *kdbus_user_unref(struct kdbus_user *u) +{ + if (u) + kref_put(&u->kref, __kdbus_user_free); + return NULL; +} diff -Nur linux-4.2/ipc/kdbus/domain.h linux-4.2-pck/ipc/kdbus/domain.h --- linux-4.2/ipc/kdbus/domain.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/domain.h 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef __KDBUS_DOMAIN_H +#define __KDBUS_DOMAIN_H + +#include +#include +#include +#include + +#include "node.h" + +/** + * struct kdbus_domain - domain for buses + * @node: Underlying API node + * @lock: Domain data lock + * @last_id: Last used object id + * @user_idr: Set of all users indexed by UID + * @user_ida: Set of all users to compute small indices + * @user_namespace: User namespace, pinned at creation time + * @dentry: Root dentry of VFS mount (don't use outside of kdbusfs) + */ +struct kdbus_domain { + struct kdbus_node node; + struct mutex lock; + atomic64_t last_id; + struct idr user_idr; + struct ida user_ida; + struct user_namespace *user_namespace; + struct dentry *dentry; +}; + +/** + * struct kdbus_user - resource accounting for users + * @kref: Reference counter + * @domain: Domain of the user + * @id: Index of this user + * @uid: UID of the user + * @buses: Number of buses the user has created + * @connections: Number of connections the user has created + */ +struct kdbus_user { + struct kref kref; + struct kdbus_domain *domain; + unsigned int id; + kuid_t uid; + atomic_t buses; + atomic_t connections; +}; + +#define kdbus_domain_from_node(_node) \ + container_of((_node), struct kdbus_domain, node) + +struct kdbus_domain *kdbus_domain_new(unsigned int access); +struct kdbus_domain *kdbus_domain_ref(struct kdbus_domain *domain); +struct kdbus_domain *kdbus_domain_unref(struct kdbus_domain *domain); +int kdbus_domain_populate(struct kdbus_domain *domain, unsigned int access); + +#define KDBUS_USER_KERNEL_ID 0 /* ID 0 is reserved for kernel accounting */ + +struct kdbus_user *kdbus_user_lookup(struct kdbus_domain *domain, kuid_t uid); +struct kdbus_user *kdbus_user_ref(struct kdbus_user *u); +struct kdbus_user *kdbus_user_unref(struct kdbus_user *u); + +#endif diff -Nur linux-4.2/ipc/kdbus/endpoint.c linux-4.2-pck/ipc/kdbus/endpoint.c --- linux-4.2/ipc/kdbus/endpoint.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/endpoint.c 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,303 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "domain.h" +#include "endpoint.h" +#include "handle.h" +#include "item.h" +#include "message.h" +#include "policy.h" + +static void kdbus_ep_free(struct kdbus_node *node) +{ + struct kdbus_ep *ep = container_of(node, struct kdbus_ep, node); + + WARN_ON(!list_empty(&ep->conn_list)); + + kdbus_policy_db_clear(&ep->policy_db); + kdbus_bus_unref(ep->bus); + kdbus_user_unref(ep->user); + kfree(ep); +} + +static void kdbus_ep_release(struct kdbus_node *node, bool was_active) +{ + struct kdbus_ep *ep = container_of(node, struct kdbus_ep, node); + + /* disconnect all connections to this endpoint */ + for (;;) { + struct kdbus_conn *conn; + + mutex_lock(&ep->lock); + conn = list_first_entry_or_null(&ep->conn_list, + struct kdbus_conn, + ep_entry); + if (!conn) { + mutex_unlock(&ep->lock); + break; + } + + /* take reference, release lock, disconnect without lock */ + kdbus_conn_ref(conn); + mutex_unlock(&ep->lock); + + kdbus_conn_disconnect(conn, false); + kdbus_conn_unref(conn); + } +} + +/** + * kdbus_ep_new() - create a new endpoint + * @bus: The bus this endpoint will be created for + * @name: The name of the endpoint + * @access: The access flags for this node (KDBUS_MAKE_ACCESS_*) + * @uid: The uid of the node + * @gid: The gid of the node + * @is_custom: Whether this is a custom endpoint + * + * This function will create a new endpoint with the given + * name and properties for a given bus. + * + * Return: a new kdbus_ep on success, ERR_PTR on failure. + */ +struct kdbus_ep *kdbus_ep_new(struct kdbus_bus *bus, const char *name, + unsigned int access, kuid_t uid, kgid_t gid, + bool is_custom) +{ + struct kdbus_ep *e; + int ret; + + /* + * Validate only custom endpoints names, default endpoints + * with a "bus" name are created when the bus is created + */ + if (is_custom) { + ret = kdbus_verify_uid_prefix(name, bus->domain->user_namespace, + uid); + if (ret < 0) + return ERR_PTR(ret); + } + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + + kdbus_node_init(&e->node, KDBUS_NODE_ENDPOINT); + + e->node.free_cb = kdbus_ep_free; + e->node.release_cb = kdbus_ep_release; + e->node.uid = uid; + e->node.gid = gid; + e->node.mode = S_IRUSR | S_IWUSR; + if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD)) + e->node.mode |= S_IRGRP | S_IWGRP; + if (access & KDBUS_MAKE_ACCESS_WORLD) + e->node.mode |= S_IROTH | S_IWOTH; + + mutex_init(&e->lock); + INIT_LIST_HEAD(&e->conn_list); + kdbus_policy_db_init(&e->policy_db); + e->bus = kdbus_bus_ref(bus); + + ret = kdbus_node_link(&e->node, &bus->node, name); + if (ret < 0) + goto exit_unref; + + /* + * Transactions on custom endpoints are never accounted on the global + * user limits. Instead, for each custom endpoint, we create a custom, + * unique user, which all transactions are accounted on. Regardless of + * the user using that endpoint, it is always accounted on the same + * user-object. This budget is not shared with ordinary users on + * non-custom endpoints. 
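+	 * As an example, two custom endpoints created on the same bus each
+	 * get their own anonymous user object (see the INVALID_UID lookup
+	 * below), so exhausting the quota on one of them cannot starve
+	 * clients of the other.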
+ */ + if (is_custom) { + e->user = kdbus_user_lookup(bus->domain, INVALID_UID); + if (IS_ERR(e->user)) { + ret = PTR_ERR(e->user); + e->user = NULL; + goto exit_unref; + } + } + + return e; + +exit_unref: + kdbus_node_drain(&e->node); + kdbus_node_unref(&e->node); + return ERR_PTR(ret); +} + +/** + * kdbus_ep_ref() - increase the reference counter of a kdbus_ep + * @ep: The endpoint to reference + * + * Every user of an endpoint, except for its creator, must add a reference to + * the kdbus_ep instance using this function. + * + * Return: the ep itself + */ +struct kdbus_ep *kdbus_ep_ref(struct kdbus_ep *ep) +{ + if (ep) + kdbus_node_ref(&ep->node); + return ep; +} + +/** + * kdbus_ep_unref() - decrease the reference counter of a kdbus_ep + * @ep: The ep to unref + * + * Release a reference. If the reference count drops to 0, the ep will be + * freed. + * + * Return: NULL + */ +struct kdbus_ep *kdbus_ep_unref(struct kdbus_ep *ep) +{ + if (ep) + kdbus_node_unref(&ep->node); + return NULL; +} + +/** + * kdbus_ep_is_privileged() - check whether a file is privileged + * @ep: endpoint to operate on + * @file: file to test + * + * Return: True if @file is privileged in the domain of @ep. + */ +bool kdbus_ep_is_privileged(struct kdbus_ep *ep, struct file *file) +{ + return !ep->user && + file_ns_capable(file, ep->bus->domain->user_namespace, + CAP_IPC_OWNER); +} + +/** + * kdbus_ep_is_owner() - check whether a file should be treated as bus owner + * @ep: endpoint to operate on + * @file: file to test + * + * Return: True if @file should be treated as bus owner on @ep + */ +bool kdbus_ep_is_owner(struct kdbus_ep *ep, struct file *file) +{ + return !ep->user && + (uid_eq(file->f_cred->euid, ep->bus->node.uid) || + kdbus_ep_is_privileged(ep, file)); +} + +/** + * kdbus_cmd_ep_make() - handle KDBUS_CMD_ENDPOINT_MAKE + * @bus: bus to operate on + * @argp: command payload + * + * Return: NULL or newly created endpoint on success, ERR_PTR on failure. + */ +struct kdbus_ep *kdbus_cmd_ep_make(struct kdbus_bus *bus, void __user *argp) +{ + const char *item_make_name; + struct kdbus_ep *ep = NULL; + struct kdbus_cmd *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_MAKE_NAME, .mandatory = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_MAKE_ACCESS_GROUP | + KDBUS_MAKE_ACCESS_WORLD, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + + item_make_name = argv[1].item->str; + + ep = kdbus_ep_new(bus, item_make_name, cmd->flags, + current_euid(), current_egid(), true); + if (IS_ERR(ep)) { + ret = PTR_ERR(ep); + ep = NULL; + goto exit; + } + + if (!kdbus_node_activate(&ep->node)) { + ret = -ESHUTDOWN; + goto exit; + } + +exit: + ret = kdbus_args_clear(&args, ret); + if (ret < 0) { + if (ep) { + kdbus_node_drain(&ep->node); + kdbus_ep_unref(ep); + } + return ERR_PTR(ret); + } + return ep; +} + +/** + * kdbus_cmd_ep_update() - handle KDBUS_CMD_ENDPOINT_UPDATE + * @ep: endpoint to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. 
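+ *
+ * The accepted items are KDBUS_ITEM_NAME and KDBUS_ITEM_POLICY_ACCESS
+ * (see the argv[] table below). A user-space sketch, where ep_fd is an
+ * endpoint-owner fd and cmd is a payload built from those items (error
+ * handling omitted):
+ *
+ *	ioctl(ep_fd, KDBUS_CMD_ENDPOINT_UPDATE, cmd);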
+ */
+int kdbus_cmd_ep_update(struct kdbus_ep *ep, void __user *argp)
+{
+	struct kdbus_cmd *cmd;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_NAME, .multiple = true },
+		{ .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	ret = kdbus_policy_set(&ep->policy_db, args.items, args.items_size,
+			       0, true, ep);
+	return kdbus_args_clear(&args, ret);
+}
diff -Nur linux-4.2/ipc/kdbus/endpoint.h linux-4.2-pck/ipc/kdbus/endpoint.h
--- linux-4.2/ipc/kdbus/endpoint.h	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/endpoint.h	2015-09-08 00:48:22.231697056 -0300
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_ENDPOINT_H
+#define __KDBUS_ENDPOINT_H
+
+#include
+#include
+#include
+#include "node.h"
+#include "policy.h"
+
+struct kdbus_bus;
+struct kdbus_user;
+
+/**
+ * struct kdbus_ep - endpoint to access a bus
+ * @node: The kdbus node
+ * @lock: Endpoint data lock
+ * @bus: Bus behind this endpoint
+ * @user: Custom endpoints account against an anonymous user
+ * @policy_db: Uploaded policy
+ * @conn_list: Connections of this endpoint
+ *
+ * An endpoint offers access to a bus; the default endpoint node name is "bus".
+ * Additional custom endpoints to the same bus can be created and they can
+ * carry their own policies/filters.
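+ *
+ * For example, a custom endpoint created by UID 1000 under the
+ * illustrative name "1000-review" shows up as a file next to the default
+ * "bus" file in the bus directory; custom endpoint names are validated
+ * against the creator's UID via kdbus_verify_uid_prefix() in
+ * kdbus_ep_new().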
+ */
+struct kdbus_ep {
+	struct kdbus_node node;
+	struct mutex lock;
+
+	/* static */
+	struct kdbus_bus *bus;
+	struct kdbus_user *user;
+
+	/* protected by own locks */
+	struct kdbus_policy_db policy_db;
+
+	/* protected by ep->lock */
+	struct list_head conn_list;
+};
+
+#define kdbus_ep_from_node(_node) \
+	container_of((_node), struct kdbus_ep, node)
+
+struct kdbus_ep *kdbus_ep_new(struct kdbus_bus *bus, const char *name,
+			      unsigned int access, kuid_t uid, kgid_t gid,
+			      bool is_custom);
+struct kdbus_ep *kdbus_ep_ref(struct kdbus_ep *ep);
+struct kdbus_ep *kdbus_ep_unref(struct kdbus_ep *ep);
+
+bool kdbus_ep_is_privileged(struct kdbus_ep *ep, struct file *file);
+bool kdbus_ep_is_owner(struct kdbus_ep *ep, struct file *file);
+
+struct kdbus_ep *kdbus_cmd_ep_make(struct kdbus_bus *bus, void __user *argp);
+int kdbus_cmd_ep_update(struct kdbus_ep *ep, void __user *argp);
+
+#endif
diff -Nur linux-4.2/ipc/kdbus/fs.c linux-4.2-pck/ipc/kdbus/fs.c
--- linux-4.2/ipc/kdbus/fs.c	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/fs.c	2015-09-08 00:48:22.231697056 -0300
@@ -0,0 +1,508 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "bus.h"
+#include "domain.h"
+#include "endpoint.h"
+#include "fs.h"
+#include "handle.h"
+#include "node.h"
+
+#define kdbus_node_from_dentry(_dentry) \
+	((struct kdbus_node *)(_dentry)->d_fsdata)
+
+static struct inode *fs_inode_get(struct super_block *sb,
+				  struct kdbus_node *node);
+
+/*
+ * Directory Management
+ */
+
+static inline unsigned char kdbus_dt_type(struct kdbus_node *node)
+{
+	switch (node->type) {
+	case KDBUS_NODE_DOMAIN:
+	case KDBUS_NODE_BUS:
+		return DT_DIR;
+	case KDBUS_NODE_CONTROL:
+	case KDBUS_NODE_ENDPOINT:
+		return DT_REG;
+	}
+
+	return DT_UNKNOWN;
+}
+
+static int fs_dir_fop_iterate(struct file *file, struct dir_context *ctx)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct kdbus_node *parent = kdbus_node_from_dentry(dentry);
+	struct kdbus_node *old, *next = file->private_data;
+
+	/*
+	 * kdbusfs directory iterator (modelled after sysfs/kernfs)
+	 * When iterating kdbusfs directories, we iterate all children of the
+	 * parent kdbus_node object. We use ctx->pos to store the hash of the
+	 * child and file->private_data to store a reference to the next node
+	 * object. If ctx->pos is not modified via llseek while you iterate a
+	 * directory, then we use the file->private_data node pointer to
+	 * directly access the next node in the tree.
+	 * However, if you directly seek on the directory, we have to find the
+	 * closest node to that position and cannot use our node pointer. This
+	 * means iterating the rb-tree to find the closest match and start over
+	 * from there.
+	 * Note that hash values are not necessarily unique. Therefore, llseek
+	 * is not guaranteed to seek to the same node that you got when you
+	 * retrieved the position. Seeking to 0, 1, 2 and >=INT_MAX is safe,
+	 * though.
We could use the inode-number as position, but this would + * require another rb-tree for fast access. Kernfs and others already + * ignore those conflicts, so we should be fine, too. + */ + + if (!dir_emit_dots(file, ctx)) + return 0; + + /* acquire @next; if deactivated, or seek detected, find next node */ + old = next; + if (next && ctx->pos == next->hash) { + if (kdbus_node_acquire(next)) + kdbus_node_ref(next); + else + next = kdbus_node_next_child(parent, next); + } else { + next = kdbus_node_find_closest(parent, ctx->pos); + } + kdbus_node_unref(old); + + while (next) { + /* emit @next */ + file->private_data = next; + ctx->pos = next->hash; + + kdbus_node_release(next); + + if (!dir_emit(ctx, next->name, strlen(next->name), next->id, + kdbus_dt_type(next))) + return 0; + + /* find next node after @next */ + old = next; + next = kdbus_node_next_child(parent, next); + kdbus_node_unref(old); + } + + file->private_data = NULL; + ctx->pos = INT_MAX; + + return 0; +} + +static loff_t fs_dir_fop_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + /* protect f_off against fop_iterate */ + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +static int fs_dir_fop_release(struct inode *inode, struct file *file) +{ + kdbus_node_unref(file->private_data); + return 0; +} + +static const struct file_operations fs_dir_fops = { + .read = generic_read_dir, + .iterate = fs_dir_fop_iterate, + .llseek = fs_dir_fop_llseek, + .release = fs_dir_fop_release, +}; + +static struct dentry *fs_dir_iop_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct dentry *dnew = NULL; + struct kdbus_node *parent; + struct kdbus_node *node; + struct inode *inode; + + parent = kdbus_node_from_dentry(dentry->d_parent); + if (!kdbus_node_acquire(parent)) + return NULL; + + /* returns reference to _acquired_ child node */ + node = kdbus_node_find_child(parent, dentry->d_name.name); + if (node) { + dentry->d_fsdata = node; + inode = fs_inode_get(dir->i_sb, node); + if (IS_ERR(inode)) + dnew = ERR_CAST(inode); + else + dnew = d_splice_alias(inode, dentry); + + kdbus_node_release(node); + } + + kdbus_node_release(parent); + return dnew; +} + +static const struct inode_operations fs_dir_iops = { + .permission = generic_permission, + .lookup = fs_dir_iop_lookup, +}; + +/* + * Inode Management + */ + +static const struct inode_operations fs_inode_iops = { + .permission = generic_permission, +}; + +static struct inode *fs_inode_get(struct super_block *sb, + struct kdbus_node *node) +{ + struct inode *inode; + + inode = iget_locked(sb, node->id); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + inode->i_private = kdbus_node_ref(node); + inode->i_mapping->a_ops = &empty_aops; + inode->i_mode = node->mode & S_IALLUGO; + inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode->i_uid = node->uid; + inode->i_gid = node->gid; + + switch (node->type) { + case KDBUS_NODE_DOMAIN: + case KDBUS_NODE_BUS: + inode->i_mode |= S_IFDIR; + inode->i_op = &fs_dir_iops; + inode->i_fop = &fs_dir_fops; + set_nlink(inode, 2); + break; + case KDBUS_NODE_CONTROL: + case KDBUS_NODE_ENDPOINT: + inode->i_mode |= S_IFREG; + inode->i_op = &fs_inode_iops; + inode->i_fop = &kdbus_handle_ops; + break; + } + + unlock_new_inode(inode); + + return inode; +} + +/* + * Superblock Management + */ + +static int 
fs_super_dop_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct kdbus_node *node;
+
+	/* Force lookup on negatives */
+	if (!dentry->d_inode)
+		return 0;
+
+	node = kdbus_node_from_dentry(dentry);
+
+	/* see whether the node has been removed */
+	if (!kdbus_node_is_active(node))
+		return 0;
+
+	return 1;
+}
+
+static void fs_super_dop_release(struct dentry *dentry)
+{
+	kdbus_node_unref(dentry->d_fsdata);
+}
+
+static const struct dentry_operations fs_super_dops = {
+	.d_revalidate = fs_super_dop_revalidate,
+	.d_release = fs_super_dop_release,
+};
+
+static void fs_super_sop_evict_inode(struct inode *inode)
+{
+	struct kdbus_node *node = kdbus_node_from_inode(inode);
+
+	truncate_inode_pages_final(&inode->i_data);
+	clear_inode(inode);
+	kdbus_node_unref(node);
+}
+
+static const struct super_operations fs_super_sops = {
+	.statfs = simple_statfs,
+	.drop_inode = generic_delete_inode,
+	.evict_inode = fs_super_sop_evict_inode,
+};
+
+static int fs_super_fill(struct super_block *sb)
+{
+	struct kdbus_domain *domain = sb->s_fs_info;
+	struct inode *inode;
+	int ret;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = KDBUS_SUPER_MAGIC;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_op = &fs_super_sops;
+	sb->s_time_gran = 1;
+
+	inode = fs_inode_get(sb, &domain->node);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root) {
+		/* d_make_root iput()s the inode on failure */
+		return -ENOMEM;
+	}
+
+	/* sb holds domain reference */
+	sb->s_root->d_fsdata = &domain->node;
+	sb->s_d_op = &fs_super_dops;
+
+	/* sb holds root reference */
+	domain->dentry = sb->s_root;
+
+	if (!kdbus_node_activate(&domain->node))
+		return -ESHUTDOWN;
+
+	ret = kdbus_domain_populate(domain, KDBUS_MAKE_ACCESS_WORLD);
+	if (ret < 0)
+		return ret;
+
+	sb->s_flags |= MS_ACTIVE;
+	return 0;
+}
+
+static void fs_super_kill(struct super_block *sb)
+{
+	struct kdbus_domain *domain = sb->s_fs_info;
+
+	if (domain) {
+		kdbus_node_drain(&domain->node);
+		domain->dentry = NULL;
+	}
+
+	kill_anon_super(sb);
+	kdbus_domain_unref(domain);
+}
+
+static int fs_super_set(struct super_block *sb, void *data)
+{
+	int ret;
+
+	ret = set_anon_super(sb, data);
+	if (!ret)
+		sb->s_fs_info = data;
+
+	return ret;
+}
+
+static struct dentry *fs_super_mount(struct file_system_type *fs_type,
+				     int flags, const char *dev_name,
+				     void *data)
+{
+	struct kdbus_domain *domain;
+	struct super_block *sb;
+	int ret;
+
+	domain = kdbus_domain_new(KDBUS_MAKE_ACCESS_WORLD);
+	if (IS_ERR(domain))
+		return ERR_CAST(domain);
+
+	sb = sget(fs_type, NULL, fs_super_set, flags, domain);
+	if (IS_ERR(sb)) {
+		kdbus_node_drain(&domain->node);
+		kdbus_domain_unref(domain);
+		return ERR_CAST(sb);
+	}
+
+	WARN_ON(sb->s_fs_info != domain);
+	WARN_ON(sb->s_root);
+
+	ret = fs_super_fill(sb);
+	if (ret < 0) {
+		/* calls into ->kill_sb() when done */
+		deactivate_locked_super(sb);
+		return ERR_PTR(ret);
+	}
+
+	return dget(sb->s_root);
+}
+
+static struct file_system_type fs_type = {
+	.name = KBUILD_MODNAME "fs",
+	.owner = THIS_MODULE,
+	.mount = fs_super_mount,
+	.kill_sb = fs_super_kill,
+	.fs_flags = FS_USERNS_MOUNT,
+};
+
+/**
+ * kdbus_fs_init() - register kdbus filesystem
+ *
+ * This registers a filesystem with the VFS layer. The filesystem is called
+ * `KBUILD_MODNAME "fs"', which usually resolves to `kdbusfs'. The naming
+ * scheme allows you to set KBUILD_MODNAME to "kdbus2" to get an
+ * independent filesystem for development.
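+ *
+ * A typical mount invocation looks like this (sketch; the mount point
+ * is a convention, not enforced by the code):
+ *
+ *	mount -t kdbusfs kdbusfs /sys/fs/kdbus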
+ * + * Each mount of the kdbusfs filesystem has an kdbus_domain attached. + * Operations on this mount will only affect the attached domain. On each mount + * a new domain is automatically created and used for this mount exclusively. + * If you want to share a domain across multiple mounts, you need to bind-mount + * it. + * + * Mounts of kdbusfs (with a different domain each) are unrelated to each other + * and will never have any effect on any domain but their own. + * + * Return: 0 on success, negative error otherwise. + */ +int kdbus_fs_init(void) +{ + return register_filesystem(&fs_type); +} + +/** + * kdbus_fs_exit() - unregister kdbus filesystem + * + * This does the reverse to kdbus_fs_init(). It unregisters the kdbusfs + * filesystem from VFS and cleans up any allocated resources. + */ +void kdbus_fs_exit(void) +{ + unregister_filesystem(&fs_type); +} + +/* acquire domain of @node, making sure all ancestors are active */ +static struct kdbus_domain *fs_acquire_domain(struct kdbus_node *node) +{ + struct kdbus_domain *domain; + struct kdbus_node *iter; + + /* caller must guarantee that @node is linked */ + for (iter = node; iter->parent; iter = iter->parent) + if (!kdbus_node_is_active(iter->parent)) + return NULL; + + /* root nodes are always domains */ + if (WARN_ON(iter->type != KDBUS_NODE_DOMAIN)) + return NULL; + + domain = kdbus_domain_from_node(iter); + if (!kdbus_node_acquire(&domain->node)) + return NULL; + + return domain; +} + +/** + * kdbus_fs_flush() - flush dcache entries of a node + * @node: Node to flush entries of + * + * This flushes all VFS filesystem cache entries for a node and all its + * children. This should be called whenever a node is destroyed during + * runtime. It will flush the cache entries so the linked objects can be + * deallocated. + * + * This is a no-op if you call it on active nodes (they really should stay in + * cache) or on nodes with deactivated parents (flushing the parent is enough). + * Furthermore, there is no need to call it on nodes whose lifetime is bound to + * their parents'. In those cases, the parent-flush will always also flush the + * children. + */ +void kdbus_fs_flush(struct kdbus_node *node) +{ + struct dentry *dentry, *parent_dentry = NULL; + struct kdbus_domain *domain; + struct qstr name; + + /* active nodes should remain in cache */ + if (!kdbus_node_is_deactivated(node)) + return; + + /* nodes that were never linked were never instantiated */ + if (!node->parent) + return; + + /* acquire domain and verify all ancestors are active */ + domain = fs_acquire_domain(node); + if (!domain) + return; + + switch (node->type) { + case KDBUS_NODE_ENDPOINT: + if (WARN_ON(!node->parent || !node->parent->name)) + goto exit; + + name.name = node->parent->name; + name.len = strlen(node->parent->name); + parent_dentry = d_hash_and_lookup(domain->dentry, &name); + if (IS_ERR_OR_NULL(parent_dentry)) + goto exit; + + /* fallthrough */ + case KDBUS_NODE_BUS: + if (WARN_ON(!node->name)) + goto exit; + + name.name = node->name; + name.len = strlen(node->name); + dentry = d_hash_and_lookup(parent_dentry ? 
: domain->dentry, + &name); + if (!IS_ERR_OR_NULL(dentry)) { + d_invalidate(dentry); + dput(dentry); + } + + dput(parent_dentry); + break; + + default: + /* all other types are bound to their parent lifetime */ + break; + } + +exit: + kdbus_node_release(&domain->node); +} diff -Nur linux-4.2/ipc/kdbus/fs.h linux-4.2-pck/ipc/kdbus/fs.h --- linux-4.2/ipc/kdbus/fs.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/fs.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUSFS_H +#define __KDBUSFS_H + +#include + +struct kdbus_node; + +int kdbus_fs_init(void); +void kdbus_fs_exit(void); +void kdbus_fs_flush(struct kdbus_node *node); + +#define kdbus_node_from_inode(_inode) \ + ((struct kdbus_node *)(_inode)->i_private) + +#endif diff -Nur linux-4.2/ipc/kdbus/handle.c linux-4.2-pck/ipc/kdbus/handle.c --- linux-4.2/ipc/kdbus/handle.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/handle.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,691 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "fs.h" +#include "handle.h" +#include "item.h" +#include "match.h" +#include "message.h" +#include "names.h" +#include "domain.h" +#include "policy.h" + +static int kdbus_args_verify(struct kdbus_args *args) +{ + struct kdbus_item *item; + size_t i; + int ret; + + KDBUS_ITEMS_FOREACH(item, args->items, args->items_size) { + struct kdbus_arg *arg = NULL; + + if (!KDBUS_ITEM_VALID(item, args->items, args->items_size)) + return -EINVAL; + + for (i = 0; i < args->argc; ++i) + if (args->argv[i].type == item->type) + break; + if (i >= args->argc) + return -EINVAL; + + arg = &args->argv[i]; + + ret = kdbus_item_validate(item); + if (ret < 0) + return ret; + + if (arg->item && !arg->multiple) + return -EINVAL; + + arg->item = item; + } + + if (!KDBUS_ITEMS_END(item, args->items, args->items_size)) + return -EINVAL; + + return 0; +} + +static int kdbus_args_negotiate(struct kdbus_args *args) +{ + struct kdbus_item __user *user; + struct kdbus_item *negotiation; + size_t i, j, num; + + /* + * If KDBUS_FLAG_NEGOTIATE is set, we overwrite the flags field with + * the set of supported flags. Furthermore, if an KDBUS_ITEM_NEGOTIATE + * item is passed, we iterate its payload (array of u64, each set to an + * item type) and clear all unsupported item-types to 0. 
+ * The caller might do this recursively, if other flags or objects are + * embedded in the payload itself. + */ + + if (args->cmd->flags & KDBUS_FLAG_NEGOTIATE) { + if (put_user(args->allowed_flags & ~KDBUS_FLAG_NEGOTIATE, + &args->user->flags)) + return -EFAULT; + } + + if (args->argc < 1 || args->argv[0].type != KDBUS_ITEM_NEGOTIATE || + !args->argv[0].item) + return 0; + + negotiation = args->argv[0].item; + user = (struct kdbus_item __user *) + ((u8 __user *)args->user + + ((u8 *)negotiation - (u8 *)args->cmd)); + num = KDBUS_ITEM_PAYLOAD_SIZE(negotiation) / sizeof(u64); + + for (i = 0; i < num; ++i) { + for (j = 0; j < args->argc; ++j) + if (negotiation->data64[i] == args->argv[j].type) + break; + + if (j < args->argc) + continue; + + /* this item is not supported, clear it out */ + negotiation->data64[i] = 0; + if (put_user(negotiation->data64[i], &user->data64[i])) + return -EFAULT; + } + + return 0; +} + +/** + * __kdbus_args_parse() - parse payload of kdbus command + * @args: object to parse data into + * @is_cmd: whether this is a command or msg payload + * @argp: user-space location of command payload to parse + * @type_size: overall size of command payload to parse + * @items_offset: offset of items array in command payload + * @out: output variable to store pointer to copied payload + * + * This parses the ioctl payload at user-space location @argp into @args. @args + * must be pre-initialized by the caller to reflect the supported flags and + * items of this command. This parser will then copy the command payload into + * kernel-space, verify correctness and consistency and cache pointers to parsed + * items and other data in @args. + * + * If this function succeeded, you must call kdbus_args_clear() to release + * allocated resources before destroying @args. + * + * This can also be used to import kdbus_msg objects. In that case, @is_cmd must + * be set to 'false' and the 'return_flags' field will not be touched (as it + * doesn't exist on kdbus_msg). + * + * Return: On failure a negative error code is returned. Otherwise, 1 is + * returned if negotiation was requested, 0 if not. 
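+ *
+ * A typical command handler follows the pattern of kdbus_cmd_ep_update()
+ * in endpoint.c:
+ *
+ *	ret = kdbus_args_parse(&args, argp, &cmd);
+ *	if (ret != 0)
+ *		return ret;
+ *
+ *	(execute the command)
+ *
+ *	return kdbus_args_clear(&args, ret);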
+ */ +int __kdbus_args_parse(struct kdbus_args *args, bool is_cmd, void __user *argp, + size_t type_size, size_t items_offset, void **out) +{ + u64 user_size; + int ret, i; + + ret = kdbus_copy_from_user(&user_size, argp, sizeof(user_size)); + if (ret < 0) + return ret; + + if (user_size < type_size) + return -EINVAL; + if (user_size > KDBUS_CMD_MAX_SIZE) + return -EMSGSIZE; + + if (user_size <= sizeof(args->cmd_buf)) { + if (copy_from_user(args->cmd_buf, argp, user_size)) + return -EFAULT; + args->cmd = (void*)args->cmd_buf; + } else { + args->cmd = memdup_user(argp, user_size); + if (IS_ERR(args->cmd)) + return PTR_ERR(args->cmd); + } + + if (args->cmd->size != user_size) { + ret = -EINVAL; + goto error; + } + + if (is_cmd) + args->cmd->return_flags = 0; + args->user = argp; + args->items = (void *)((u8 *)args->cmd + items_offset); + args->items_size = args->cmd->size - items_offset; + args->is_cmd = is_cmd; + + if (args->cmd->flags & ~args->allowed_flags) { + ret = -EINVAL; + goto error; + } + + ret = kdbus_args_verify(args); + if (ret < 0) + goto error; + + ret = kdbus_args_negotiate(args); + if (ret < 0) + goto error; + + /* mandatory items must be given (but not on negotiation) */ + if (!(args->cmd->flags & KDBUS_FLAG_NEGOTIATE)) { + for (i = 0; i < args->argc; ++i) + if (args->argv[i].mandatory && !args->argv[i].item) { + ret = -EINVAL; + goto error; + } + } + + *out = args->cmd; + return !!(args->cmd->flags & KDBUS_FLAG_NEGOTIATE); + +error: + return kdbus_args_clear(args, ret); +} + +/** + * kdbus_args_clear() - release allocated command resources + * @args: object to release resources of + * @ret: return value of this command + * + * This frees all allocated resources on @args and copies the command result + * flags into user-space. @ret is usually returned unchanged by this function, + * so it can be used in the final 'return' statement of the command handler. + * + * Return: -EFAULT if return values cannot be copied into user-space, otherwise + * @ret is returned unchanged. 
+ */ +int kdbus_args_clear(struct kdbus_args *args, int ret) +{ + if (!args) + return ret; + + if (!IS_ERR_OR_NULL(args->cmd)) { + if (args->is_cmd && put_user(args->cmd->return_flags, + &args->user->return_flags)) + ret = -EFAULT; + if (args->cmd != (void*)args->cmd_buf) + kfree(args->cmd); + args->cmd = NULL; + } + + return ret; +} + +/** + * enum kdbus_handle_type - type an handle can be of + * @KDBUS_HANDLE_NONE: no type set, yet + * @KDBUS_HANDLE_BUS_OWNER: bus owner + * @KDBUS_HANDLE_EP_OWNER: endpoint owner + * @KDBUS_HANDLE_CONNECTED: endpoint connection after HELLO + */ +enum kdbus_handle_type { + KDBUS_HANDLE_NONE, + KDBUS_HANDLE_BUS_OWNER, + KDBUS_HANDLE_EP_OWNER, + KDBUS_HANDLE_CONNECTED, +}; + +/** + * struct kdbus_handle - handle to the kdbus system + * @lock: handle lock + * @type: type of this handle (KDBUS_HANDLE_*) + * @bus_owner: bus this handle owns + * @ep_owner: endpoint this handle owns + * @conn: connection this handle owns + */ +struct kdbus_handle { + struct mutex lock; + + enum kdbus_handle_type type; + union { + struct kdbus_bus *bus_owner; + struct kdbus_ep *ep_owner; + struct kdbus_conn *conn; + }; +}; + +static int kdbus_handle_open(struct inode *inode, struct file *file) +{ + struct kdbus_handle *handle; + struct kdbus_node *node; + int ret; + + node = kdbus_node_from_inode(inode); + if (!kdbus_node_acquire(node)) + return -ESHUTDOWN; + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) { + ret = -ENOMEM; + goto exit; + } + + mutex_init(&handle->lock); + handle->type = KDBUS_HANDLE_NONE; + + file->private_data = handle; + ret = 0; + +exit: + kdbus_node_release(node); + return ret; +} + +static int kdbus_handle_release(struct inode *inode, struct file *file) +{ + struct kdbus_handle *handle = file->private_data; + + switch (handle->type) { + case KDBUS_HANDLE_BUS_OWNER: + if (handle->bus_owner) { + kdbus_node_drain(&handle->bus_owner->node); + kdbus_bus_unref(handle->bus_owner); + } + break; + case KDBUS_HANDLE_EP_OWNER: + if (handle->ep_owner) { + kdbus_node_drain(&handle->ep_owner->node); + kdbus_ep_unref(handle->ep_owner); + } + break; + case KDBUS_HANDLE_CONNECTED: + kdbus_conn_disconnect(handle->conn, false); + kdbus_conn_unref(handle->conn); + break; + case KDBUS_HANDLE_NONE: + /* nothing to clean up */ + break; + } + + kfree(handle); + + return 0; +} + +static long kdbus_handle_ioctl_control(struct file *file, unsigned int cmd, + void __user *argp) +{ + struct kdbus_handle *handle = file->private_data; + struct kdbus_node *node = file_inode(file)->i_private; + struct kdbus_domain *domain; + int ret = 0; + + if (!kdbus_node_acquire(node)) + return -ESHUTDOWN; + + /* + * The parent of control-nodes is always a domain, make sure to pin it + * so the parent is actually valid. 
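+	 * (The hierarchy here is domain -> "control" node, as set up by
+	 * kdbus_domain_populate() in domain.c.)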
+ */ + domain = kdbus_domain_from_node(node->parent); + if (!kdbus_node_acquire(&domain->node)) { + kdbus_node_release(node); + return -ESHUTDOWN; + } + + switch (cmd) { + case KDBUS_CMD_BUS_MAKE: { + struct kdbus_bus *bus; + + bus = kdbus_cmd_bus_make(domain, argp); + if (IS_ERR_OR_NULL(bus)) { + ret = PTR_ERR_OR_ZERO(bus); + break; + } + + handle->bus_owner = bus; + ret = KDBUS_HANDLE_BUS_OWNER; + break; + } + + default: + ret = -EBADFD; + break; + } + + kdbus_node_release(&domain->node); + kdbus_node_release(node); + return ret; +} + +static long kdbus_handle_ioctl_ep(struct file *file, unsigned int cmd, + void __user *buf) +{ + struct kdbus_handle *handle = file->private_data; + struct kdbus_node *node = file_inode(file)->i_private; + struct kdbus_ep *ep, *file_ep = kdbus_ep_from_node(node); + struct kdbus_bus *bus = file_ep->bus; + struct kdbus_conn *conn; + int ret = 0; + + if (!kdbus_node_acquire(node)) + return -ESHUTDOWN; + + switch (cmd) { + case KDBUS_CMD_ENDPOINT_MAKE: { + /* creating custom endpoints is a privileged operation */ + if (!kdbus_ep_is_owner(file_ep, file)) { + ret = -EPERM; + break; + } + + ep = kdbus_cmd_ep_make(bus, buf); + if (IS_ERR_OR_NULL(ep)) { + ret = PTR_ERR_OR_ZERO(ep); + break; + } + + handle->ep_owner = ep; + ret = KDBUS_HANDLE_EP_OWNER; + break; + } + + case KDBUS_CMD_HELLO: + conn = kdbus_cmd_hello(file_ep, file, buf); + if (IS_ERR_OR_NULL(conn)) { + ret = PTR_ERR_OR_ZERO(conn); + break; + } + + handle->conn = conn; + ret = KDBUS_HANDLE_CONNECTED; + break; + + default: + ret = -EBADFD; + break; + } + + kdbus_node_release(node); + return ret; +} + +static long kdbus_handle_ioctl_ep_owner(struct file *file, unsigned int command, + void __user *buf) +{ + struct kdbus_handle *handle = file->private_data; + struct kdbus_ep *ep = handle->ep_owner; + int ret; + + if (!kdbus_node_acquire(&ep->node)) + return -ESHUTDOWN; + + switch (command) { + case KDBUS_CMD_ENDPOINT_UPDATE: + ret = kdbus_cmd_ep_update(ep, buf); + break; + default: + ret = -EBADFD; + break; + } + + kdbus_node_release(&ep->node); + return ret; +} + +static long kdbus_handle_ioctl_connected(struct file *file, + unsigned int command, void __user *buf) +{ + struct kdbus_handle *handle = file->private_data; + struct kdbus_conn *conn = handle->conn; + struct kdbus_conn *release_conn = NULL; + int ret; + + release_conn = conn; + ret = kdbus_conn_acquire(release_conn); + if (ret < 0) + return ret; + + switch (command) { + case KDBUS_CMD_BYEBYE: + /* + * BYEBYE is special; we must not acquire a connection when + * calling into kdbus_conn_disconnect() or we will deadlock, + * because kdbus_conn_disconnect() will wait for all acquired + * references to be dropped. 
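+		 * Hence, drop the acquired reference first and call the
+		 * _unlocked() variant; release_conn is set to NULL below
+		 * so that the final kdbus_conn_release() turns into a
+		 * harmless no-op.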
+ */ + kdbus_conn_release(release_conn); + release_conn = NULL; + ret = kdbus_cmd_byebye_unlocked(conn, buf); + break; + case KDBUS_CMD_NAME_ACQUIRE: + ret = kdbus_cmd_name_acquire(conn, buf); + break; + case KDBUS_CMD_NAME_RELEASE: + ret = kdbus_cmd_name_release(conn, buf); + break; + case KDBUS_CMD_LIST: + ret = kdbus_cmd_list(conn, buf); + break; + case KDBUS_CMD_CONN_INFO: + ret = kdbus_cmd_conn_info(conn, buf); + break; + case KDBUS_CMD_BUS_CREATOR_INFO: + ret = kdbus_cmd_bus_creator_info(conn, buf); + break; + case KDBUS_CMD_UPDATE: + ret = kdbus_cmd_update(conn, buf); + break; + case KDBUS_CMD_MATCH_ADD: + ret = kdbus_cmd_match_add(conn, buf); + break; + case KDBUS_CMD_MATCH_REMOVE: + ret = kdbus_cmd_match_remove(conn, buf); + break; + case KDBUS_CMD_SEND: + ret = kdbus_cmd_send(conn, file, buf); + break; + case KDBUS_CMD_RECV: + ret = kdbus_cmd_recv(conn, buf); + break; + case KDBUS_CMD_FREE: + ret = kdbus_cmd_free(conn, buf); + break; + default: + ret = -EBADFD; + break; + } + + kdbus_conn_release(release_conn); + return ret; +} + +static long kdbus_handle_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct kdbus_handle *handle = file->private_data; + struct kdbus_node *node = kdbus_node_from_inode(file_inode(file)); + void __user *argp = (void __user *)arg; + long ret = -EBADFD; + + switch (cmd) { + case KDBUS_CMD_BUS_MAKE: + case KDBUS_CMD_ENDPOINT_MAKE: + case KDBUS_CMD_HELLO: + mutex_lock(&handle->lock); + if (handle->type == KDBUS_HANDLE_NONE) { + if (node->type == KDBUS_NODE_CONTROL) + ret = kdbus_handle_ioctl_control(file, cmd, + argp); + else if (node->type == KDBUS_NODE_ENDPOINT) + ret = kdbus_handle_ioctl_ep(file, cmd, argp); + + if (ret > 0) { + /* + * The data given via open() is not sufficient + * to setup a kdbus handle. Hence, we require + * the user to perform a setup ioctl. This setup + * can only be performed once and defines the + * type of the handle. The different setup + * ioctls are locked against each other so they + * cannot race. Once the handle type is set, + * the type-dependent ioctls are enabled. To + * improve performance, we don't lock those via + * handle->lock. Instead, we issue a + * write-barrier before performing the + * type-change, which pairs with smp_rmb() in + * all handlers that access the type field. This + * guarantees the handle is fully setup, if + * handle->type is set. If handle->type is + * unset, you must not make any assumptions + * without taking handle->lock. + * Note that handle->type is only set once. It + * will never change afterwards. + */ + smp_wmb(); + handle->type = ret; + } + } + mutex_unlock(&handle->lock); + break; + + case KDBUS_CMD_ENDPOINT_UPDATE: + case KDBUS_CMD_BYEBYE: + case KDBUS_CMD_NAME_ACQUIRE: + case KDBUS_CMD_NAME_RELEASE: + case KDBUS_CMD_LIST: + case KDBUS_CMD_CONN_INFO: + case KDBUS_CMD_BUS_CREATOR_INFO: + case KDBUS_CMD_UPDATE: + case KDBUS_CMD_MATCH_ADD: + case KDBUS_CMD_MATCH_REMOVE: + case KDBUS_CMD_SEND: + case KDBUS_CMD_RECV: + case KDBUS_CMD_FREE: { + enum kdbus_handle_type type; + + /* + * This read-barrier pairs with smp_wmb() of the handle setup. + * it guarantees the handle is fully written, in case the + * type has been set. It allows us to access the handle without + * taking handle->lock, given the guarantee that the type is + * only ever set once, and stays constant afterwards. + * Furthermore, the handle object itself is not modified in any + * way after the type is set. That is, the type-field is the + * last field that is written on any handle. 
If it has not been + * set, we must not access the handle here. + */ + type = handle->type; + smp_rmb(); + + if (type == KDBUS_HANDLE_EP_OWNER) + ret = kdbus_handle_ioctl_ep_owner(file, cmd, argp); + else if (type == KDBUS_HANDLE_CONNECTED) + ret = kdbus_handle_ioctl_connected(file, cmd, argp); + + break; + } + default: + ret = -ENOTTY; + break; + } + + return ret < 0 ? ret : 0; +} + +static unsigned int kdbus_handle_poll(struct file *file, + struct poll_table_struct *wait) +{ + struct kdbus_handle *handle = file->private_data; + enum kdbus_handle_type type; + unsigned int mask = POLLOUT | POLLWRNORM; + + /* + * This pairs with smp_wmb() during handle setup. It guarantees that + * _iff_ the handle type is set, handle->conn is valid. Furthermore, + * _iff_ the type is set, the handle object is constant and never + * changed again. If it's not set, we must not access the handle but + * bail out. We also must assume no setup has taken place, yet. + */ + type = handle->type; + smp_rmb(); + + /* Only a connected endpoint can read/write data */ + if (type != KDBUS_HANDLE_CONNECTED) + return POLLERR | POLLHUP; + + poll_wait(file, &handle->conn->wait, wait); + + /* + * Verify the connection hasn't been deactivated _after_ adding the + * wait-queue. This guarantees, that if the connection is deactivated + * after we checked it, the waitqueue is signaled and we're called + * again. + */ + if (!kdbus_conn_active(handle->conn)) + return POLLERR | POLLHUP; + + if (!list_empty(&handle->conn->queue.msg_list) || + atomic_read(&handle->conn->lost_count) > 0) + mask |= POLLIN | POLLRDNORM; + + return mask; +} + +static int kdbus_handle_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct kdbus_handle *handle = file->private_data; + enum kdbus_handle_type type; + int ret = -EBADFD; + + /* + * This pairs with smp_wmb() during handle setup. It guarantees that + * _iff_ the handle type is set, handle->conn is valid. Furthermore, + * _iff_ the type is set, the handle object is constant and never + * changed again. If it's not set, we must not access the handle but + * bail out. We also must assume no setup has taken place, yet. + */ + type = handle->type; + smp_rmb(); + + /* Only connected handles have a pool we can map */ + if (type == KDBUS_HANDLE_CONNECTED) + ret = kdbus_pool_mmap(handle->conn->pool, vma); + + return ret; +} + +const struct file_operations kdbus_handle_ops = { + .owner = THIS_MODULE, + .open = kdbus_handle_open, + .release = kdbus_handle_release, + .poll = kdbus_handle_poll, + .llseek = noop_llseek, + .unlocked_ioctl = kdbus_handle_ioctl, + .mmap = kdbus_handle_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = kdbus_handle_ioctl, +#endif +}; diff -Nur linux-4.2/ipc/kdbus/handle.h linux-4.2-pck/ipc/kdbus/handle.h --- linux-4.2/ipc/kdbus/handle.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/handle.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef __KDBUS_HANDLE_H +#define __KDBUS_HANDLE_H + +#include +#include + +extern const struct file_operations kdbus_handle_ops; + +/** + * kdbus_arg - information and state of a single ioctl command item + * @type: item type + * @item: set by the parser to the first found item of this type + * @multiple: whether multiple items of this type are allowed + * @mandatory: whether at least one item of this type is required + * + * This structure describes a single item in an ioctl command payload. The + * caller has to pre-fill the type and flags, the parser will then use this + * information to verify the ioctl payload. @item is set by the parser to point + * to the first occurrence of the item. + */ +struct kdbus_arg { + u64 type; + struct kdbus_item *item; + bool multiple : 1; + bool mandatory : 1; +}; + +/** + * kdbus_args - information and state of ioctl command parser + * @allowed_flags: set of flags this command supports + * @argc: number of items in @argv + * @argv: array of items this command supports + * @user: set by parser to user-space location of current command + * @cmd: set by parser to kernel copy of command payload + * @cmd_buf: inline buf to avoid kmalloc() on small cmds + * @items: points to item array in @cmd + * @items_size: size of @items in bytes + * @is_cmd: whether this is a command-payload or msg-payload + * + * This structure is used to parse ioctl command payloads on each invocation. + * The ioctl handler has to pre-fill the flags and allowed items before passing + * the object to kdbus_args_parse(). The parser will copy the command payload + * into kernel-space and verify the correctness of the data. + * + * We use a 256 bytes buffer for small command payloads, to be allocated on + * stack on syscall entrance. + */ +struct kdbus_args { + u64 allowed_flags; + size_t argc; + struct kdbus_arg *argv; + + struct kdbus_cmd __user *user; + struct kdbus_cmd *cmd; + u8 cmd_buf[256]; + + struct kdbus_item *items; + size_t items_size; + bool is_cmd : 1; +}; + +int __kdbus_args_parse(struct kdbus_args *args, bool is_cmd, void __user *argp, + size_t type_size, size_t items_offset, void **out); +int kdbus_args_clear(struct kdbus_args *args, int ret); + +#define kdbus_args_parse(_args, _argp, _v) \ + ({ \ + BUILD_BUG_ON(offsetof(typeof(**(_v)), size) != \ + offsetof(struct kdbus_cmd, size)); \ + BUILD_BUG_ON(offsetof(typeof(**(_v)), flags) != \ + offsetof(struct kdbus_cmd, flags)); \ + BUILD_BUG_ON(offsetof(typeof(**(_v)), return_flags) != \ + offsetof(struct kdbus_cmd, return_flags)); \ + __kdbus_args_parse((_args), 1, (_argp), sizeof(**(_v)), \ + offsetof(typeof(**(_v)), items), \ + (void **)(_v)); \ + }) + +#define kdbus_args_parse_msg(_args, _argp, _v) \ + ({ \ + BUILD_BUG_ON(offsetof(typeof(**(_v)), size) != \ + offsetof(struct kdbus_cmd, size)); \ + BUILD_BUG_ON(offsetof(typeof(**(_v)), flags) != \ + offsetof(struct kdbus_cmd, flags)); \ + __kdbus_args_parse((_args), 0, (_argp), sizeof(**(_v)), \ + offsetof(typeof(**(_v)), items), \ + (void **)(_v)); \ + }) + +#endif diff -Nur linux-4.2/ipc/kdbus/item.c linux-4.2-pck/ipc/kdbus/item.c --- linux-4.2/ipc/kdbus/item.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/item.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,293 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free 
software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include + +#include "item.h" +#include "limits.h" +#include "util.h" + +/* + * This verifies the string at position @str with size @size is properly + * zero-terminated and does not contain a 0-byte but at the end. + */ +static bool kdbus_str_valid(const char *str, size_t size) +{ + return size > 0 && memchr(str, '\0', size) == str + size - 1; +} + +/** + * kdbus_item_validate_name() - validate an item containing a name + * @item: Item to validate + * + * Return: zero on success or an negative error code on failure + */ +int kdbus_item_validate_name(const struct kdbus_item *item) +{ + const char *name = item->str; + unsigned int i; + size_t len; + + if (item->size < KDBUS_ITEM_HEADER_SIZE + 2) + return -EINVAL; + + if (item->size > KDBUS_ITEM_HEADER_SIZE + + KDBUS_SYSNAME_MAX_LEN + 1) + return -ENAMETOOLONG; + + if (!kdbus_str_valid(name, KDBUS_ITEM_PAYLOAD_SIZE(item))) + return -EINVAL; + + len = strlen(name); + if (len == 0) + return -EINVAL; + + for (i = 0; i < len; i++) { + if (isalpha(name[i])) + continue; + if (isdigit(name[i])) + continue; + if (name[i] == '_') + continue; + if (i > 0 && i + 1 < len && (name[i] == '-' || name[i] == '.')) + continue; + + return -EINVAL; + } + + return 0; +} + +/** + * kdbus_item_validate() - validate a single item + * @item: item to validate + * + * Return: 0 if item is valid, negative error code if not. + */ +int kdbus_item_validate(const struct kdbus_item *item) +{ + size_t payload_size = KDBUS_ITEM_PAYLOAD_SIZE(item); + size_t l; + int ret; + + BUILD_BUG_ON(KDBUS_ITEM_HEADER_SIZE != + sizeof(struct kdbus_item_header)); + + if (item->size < KDBUS_ITEM_HEADER_SIZE) + return -EINVAL; + + switch (item->type) { + case KDBUS_ITEM_NEGOTIATE: + if (payload_size % sizeof(u64) != 0) + return -EINVAL; + break; + + case KDBUS_ITEM_PAYLOAD_VEC: + case KDBUS_ITEM_PAYLOAD_OFF: + if (payload_size != sizeof(struct kdbus_vec)) + return -EINVAL; + if (item->vec.size == 0 || item->vec.size > SIZE_MAX) + return -EINVAL; + break; + + case KDBUS_ITEM_PAYLOAD_MEMFD: + if (payload_size != sizeof(struct kdbus_memfd)) + return -EINVAL; + if (item->memfd.size == 0 || item->memfd.size > SIZE_MAX) + return -EINVAL; + if (item->memfd.fd < 0) + return -EBADF; + break; + + case KDBUS_ITEM_FDS: + if (payload_size % sizeof(int) != 0) + return -EINVAL; + break; + + case KDBUS_ITEM_CANCEL_FD: + if (payload_size != sizeof(int)) + return -EINVAL; + break; + + case KDBUS_ITEM_BLOOM_PARAMETER: + if (payload_size != sizeof(struct kdbus_bloom_parameter)) + return -EINVAL; + break; + + case KDBUS_ITEM_BLOOM_FILTER: + /* followed by the bloom-mask, depends on the bloom-size */ + if (payload_size < sizeof(struct kdbus_bloom_filter)) + return -EINVAL; + break; + + case KDBUS_ITEM_BLOOM_MASK: + /* size depends on bloom-size of bus */ + break; + + case KDBUS_ITEM_CONN_DESCRIPTION: + case KDBUS_ITEM_MAKE_NAME: + ret = kdbus_item_validate_name(item); + if (ret < 0) + return ret; + break; + + case KDBUS_ITEM_ATTACH_FLAGS_SEND: + case KDBUS_ITEM_ATTACH_FLAGS_RECV: + case KDBUS_ITEM_ID: + case KDBUS_ITEM_DST_ID: + if (payload_size != sizeof(u64)) + return -EINVAL; + break; + + case KDBUS_ITEM_TIMESTAMP: + if (payload_size != sizeof(struct kdbus_timestamp)) + return -EINVAL; + break; + + case KDBUS_ITEM_CREDS: + if (payload_size != 
sizeof(struct kdbus_creds)) + return -EINVAL; + break; + + case KDBUS_ITEM_AUXGROUPS: + if (payload_size % sizeof(u32) != 0) + return -EINVAL; + break; + + case KDBUS_ITEM_NAME: + case KDBUS_ITEM_DST_NAME: + case KDBUS_ITEM_PID_COMM: + case KDBUS_ITEM_TID_COMM: + case KDBUS_ITEM_EXE: + case KDBUS_ITEM_CMDLINE: + case KDBUS_ITEM_CGROUP: + case KDBUS_ITEM_SECLABEL: + if (!kdbus_str_valid(item->str, payload_size)) + return -EINVAL; + break; + + case KDBUS_ITEM_CAPS: + if (payload_size < sizeof(u32)) + return -EINVAL; + if (payload_size < sizeof(u32) + + 4 * CAP_TO_INDEX(item->caps.last_cap) * sizeof(u32)) + return -EINVAL; + break; + + case KDBUS_ITEM_AUDIT: + if (payload_size != sizeof(struct kdbus_audit)) + return -EINVAL; + break; + + case KDBUS_ITEM_POLICY_ACCESS: + if (payload_size != sizeof(struct kdbus_policy_access)) + return -EINVAL; + break; + + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_REMOVE: + case KDBUS_ITEM_NAME_CHANGE: + if (payload_size < sizeof(struct kdbus_notify_name_change)) + return -EINVAL; + l = payload_size - offsetof(struct kdbus_notify_name_change, + name); + if (l > 0 && !kdbus_str_valid(item->name_change.name, l)) + return -EINVAL; + break; + + case KDBUS_ITEM_ID_ADD: + case KDBUS_ITEM_ID_REMOVE: + if (payload_size != sizeof(struct kdbus_notify_id_change)) + return -EINVAL; + break; + + case KDBUS_ITEM_REPLY_TIMEOUT: + case KDBUS_ITEM_REPLY_DEAD: + if (payload_size != 0) + return -EINVAL; + break; + + default: + break; + } + + return 0; +} + +/** + * kdbus_items_validate() - validate items passed by user-space + * @items: items to validate + * @items_size: size of @items in bytes + * + * This verifies that the passed items pointer is consistent and valid. + * Furthermore, each item is checked for: + * - valid "size" value + * - payload is of expected type + * - payload is fully included in the item + * - string payloads are zero-terminated + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_items_validate(const struct kdbus_item *items, size_t items_size) +{ + const struct kdbus_item *item; + int ret; + + KDBUS_ITEMS_FOREACH(item, items, items_size) { + if (!KDBUS_ITEM_VALID(item, items, items_size)) + return -EINVAL; + + ret = kdbus_item_validate(item); + if (ret < 0) + return ret; + } + + if (!KDBUS_ITEMS_END(item, items, items_size)) + return -EINVAL; + + return 0; +} + +/** + * kdbus_item_set() - Set item content + * @item: The item to modify + * @type: The item type to set (KDBUS_ITEM_*) + * @data: Data to copy to item->data, may be %NULL + * @len: Number of bytes in @data + * + * This sets type, size and data fields of an item. If @data is NULL, the data + * memory is cleared. + * + * Note that you must align your @data memory to 8 bytes. Trailing padding (in + * case @len is not 8-byte aligned) is cleared by this call. + * + * Return: Pointer to the following item.
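 + * + * For illustration only, a sketch of how a caller might chain two items + * into a pre-allocated, 8-byte-aligned buffer (buf, name and param are + * hypothetical, error handling omitted): + * + * struct kdbus_item *item = buf; + * + * item = kdbus_item_set(item, KDBUS_ITEM_MAKE_NAME, + * name, strlen(name) + 1); + * item = kdbus_item_set(item, KDBUS_ITEM_BLOOM_PARAMETER, + * &param, sizeof(param));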
+ */ +struct kdbus_item *kdbus_item_set(struct kdbus_item *item, u64 type, + const void *data, size_t len) +{ + item->type = type; + item->size = KDBUS_ITEM_HEADER_SIZE + len; + + if (data) { + memcpy(item->data, data, len); + memset(item->data + len, 0, KDBUS_ALIGN8(len) - len); + } else { + memset(item->data, 0, KDBUS_ALIGN8(len)); + } + + return KDBUS_ITEM_NEXT(item); +} diff -Nur linux-4.2/ipc/kdbus/item.h linux-4.2-pck/ipc/kdbus/item.h --- linux-4.2/ipc/kdbus/item.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/item.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_ITEM_H +#define __KDBUS_ITEM_H + +#include <linux/kernel.h> +#include <uapi/linux/kdbus.h> + +#include "util.h" + +/* generic access and iterators over a stream of items */ +#define KDBUS_ITEM_NEXT(_i) (typeof(_i))((u8 *)(_i) + KDBUS_ALIGN8((_i)->size)) +#define KDBUS_ITEMS_SIZE(_h, _is) ((_h)->size - offsetof(typeof(*(_h)), _is)) +#define KDBUS_ITEM_HEADER_SIZE offsetof(struct kdbus_item, data) +#define KDBUS_ITEM_SIZE(_s) KDBUS_ALIGN8(KDBUS_ITEM_HEADER_SIZE + (_s)) +#define KDBUS_ITEM_PAYLOAD_SIZE(_i) ((_i)->size - KDBUS_ITEM_HEADER_SIZE) + +#define KDBUS_ITEMS_FOREACH(_i, _is, _s) \ + for ((_i) = (_is); \ + ((u8 *)(_i) < (u8 *)(_is) + (_s)) && \ + ((u8 *)(_i) >= (u8 *)(_is)); \ + (_i) = KDBUS_ITEM_NEXT(_i)) + +#define KDBUS_ITEM_VALID(_i, _is, _s) \ + ((_i)->size >= KDBUS_ITEM_HEADER_SIZE && \ + (u8 *)(_i) + (_i)->size > (u8 *)(_i) && \ + (u8 *)(_i) + (_i)->size <= (u8 *)(_is) + (_s) && \ + (u8 *)(_i) >= (u8 *)(_is)) + +#define KDBUS_ITEMS_END(_i, _is, _s) \ + ((u8 *)(_i) == ((u8 *)(_is) + KDBUS_ALIGN8(_s))) + +/** + * struct kdbus_item_header - Describes the fixed part of an item + * @size: The total size of the item + * @type: The item type, one of KDBUS_ITEM_* + */ +struct kdbus_item_header { + u64 size; + u64 type; +}; + +int kdbus_item_validate_name(const struct kdbus_item *item); +int kdbus_item_validate(const struct kdbus_item *item); +int kdbus_items_validate(const struct kdbus_item *items, size_t items_size); +struct kdbus_item *kdbus_item_set(struct kdbus_item *item, u64 type, + const void *data, size_t len); + +#endif diff -Nur linux-4.2/ipc/kdbus/limits.h linux-4.2-pck/ipc/kdbus/limits.h --- linux-4.2/ipc/kdbus/limits.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/limits.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version.
+ */ + +#ifndef __KDBUS_DEFAULTS_H +#define __KDBUS_DEFAULTS_H + +#include <linux/sizes.h> + +/* maximum size of message header and items */ +#define KDBUS_MSG_MAX_SIZE SZ_8K + +/* maximum number of memfd items per message */ +#define KDBUS_MSG_MAX_MEMFD_ITEMS 16 + +/* max size of ioctl command data */ +#define KDBUS_CMD_MAX_SIZE SZ_32K + +/* maximum number of inflight fds in a target queue per user */ +#define KDBUS_CONN_MAX_FDS_PER_USER 16 + +/* maximum message payload size */ +#define KDBUS_MSG_MAX_PAYLOAD_VEC_SIZE SZ_2M + +/* maximum size of bloom bit field in bytes */ +#define KDBUS_BUS_BLOOM_MAX_SIZE SZ_4K + +/* maximum length of well-known bus name */ +#define KDBUS_NAME_MAX_LEN 255 + +/* maximum length of bus, domain, ep name */ +#define KDBUS_SYSNAME_MAX_LEN 63 + +/* maximum number of matches per connection */ +#define KDBUS_MATCH_MAX 4096 + +/* maximum number of queued messages from the same individual user */ +#define KDBUS_CONN_MAX_MSGS 256 + +/* maximum number of well-known names per connection */ +#define KDBUS_CONN_MAX_NAMES 256 + +/* maximum number of queued requests waiting for a reply */ +#define KDBUS_CONN_MAX_REQUESTS_PENDING 128 + +/* maximum number of connections per user in one domain */ +#define KDBUS_USER_MAX_CONN 1024 + +/* maximum number of buses per user in one domain */ +#define KDBUS_USER_MAX_BUSES 16 + +#endif diff -Nur linux-4.2/ipc/kdbus/main.c linux-4.2-pck/ipc/kdbus/main.c --- linux-4.2/ipc/kdbus/main.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/main.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/module.h> + +#include "util.h" +#include "fs.h" +#include "handle.h" +#include "metadata.h" +#include "node.h" + +/* + * This is a simplified outline of the internal kdbus object relations, for + * those interested in the inner life of the driver implementation.
+ * + * From a mount point's (domain's) perspective: + * + * struct kdbus_domain + * |» struct kdbus_user *user (many, owned) + * '» struct kdbus_node node (embedded) + * |» struct kdbus_node children (many, referenced) + * |» struct kdbus_node *parent (pinned) + * '» struct kdbus_bus (many, pinned) + * |» struct kdbus_node node (embedded) + * '» struct kdbus_ep (many, pinned) + * |» struct kdbus_node node (embedded) + * |» struct kdbus_bus *bus (pinned) + * |» struct kdbus_conn conn_list (many, pinned) + * | |» struct kdbus_ep *ep (pinned) + * | |» struct kdbus_name_entry *activator_of (owned) + * | |» struct kdbus_meta *meta (owned) + * | |» struct kdbus_match_db *match_db (owned) + * | | '» struct kdbus_match_entry (many, owned) + * | | + * | |» struct kdbus_pool *pool (owned) + * | | '» struct kdbus_pool_slice *slices (many, owned) + * | | '» struct kdbus_pool *pool (pinned) + * | | + * | |» struct kdbus_user *user (pinned) + * | `» struct kdbus_queue_entry entries (many, embedded) + * | |» struct kdbus_pool_slice *slice (pinned) + * | |» struct kdbus_conn_reply *reply (owned) + * | '» struct kdbus_user *user (pinned) + * | + * '» struct kdbus_user *user (pinned) + * '» struct kdbus_policy_db policy_db (embedded) + * |» struct kdbus_policy_db_entry (many, owned) + * | |» struct kdbus_conn (pinned) + * | '» struct kdbus_ep (pinned) + * | + * '» struct kdbus_policy_db_cache_entry (many, owned) + * '» struct kdbus_conn (pinned) + * + * For the life-time of a file descriptor derived from calling open() on a file + * inside the mount point: + * + * struct kdbus_handle + * |» struct kdbus_meta *meta (owned) + * |» struct kdbus_ep *ep (pinned) + * |» struct kdbus_conn *conn (owned) + * '» struct kdbus_ep *ep (owned) + */ + +static int __init kdbus_init(void) +{ + int ret; + + ret = sysfs_create_mount_point(fs_kobj, KBUILD_MODNAME); + if (ret) + return ret; + + ret = kdbus_fs_init(); + if (ret < 0) { + pr_err("cannot register filesystem: %d\n", ret); + goto exit_dir; + } + + pr_info("initialized\n"); + return 0; + +exit_dir: + sysfs_remove_mount_point(fs_kobj, KBUILD_MODNAME); + return ret; +} + +static void __exit kdbus_exit(void) +{ + kdbus_fs_exit(); + sysfs_remove_mount_point(fs_kobj, KBUILD_MODNAME); + ida_destroy(&kdbus_node_ida); +} + +module_init(kdbus_init); +module_exit(kdbus_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("D-Bus, powerful, easy to use interprocess communication"); +MODULE_ALIAS_FS(KBUILD_MODNAME "fs"); diff -Nur linux-4.2/ipc/kdbus/match.c linux-4.2-pck/ipc/kdbus/match.c --- linux-4.2/ipc/kdbus/match.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/match.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,546 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "handle.h" +#include "item.h" +#include "match.h" +#include "message.h" +#include "names.h" + +/** + * struct kdbus_match_db - message filters + * @entries_list: List of matches + * @mdb_rwlock: Match data lock + * @entries_count: Number of entries in database + */ +struct kdbus_match_db { + struct list_head entries_list; + struct rw_semaphore mdb_rwlock; + unsigned int entries_count; +}; + +/** + * struct kdbus_match_entry - a match database entry + * @cookie: User-supplied cookie to lookup the entry + * @list_entry: The list entry element for the db list + * @rules_list: The list head for tracking rules of this entry + */ +struct kdbus_match_entry { + u64 cookie; + struct list_head list_entry; + struct list_head rules_list; +}; + +/** + * struct kdbus_bloom_mask - mask to match against filter + * @generations: Number of generations carried + * @data: Array of bloom bit fields + */ +struct kdbus_bloom_mask { + u64 generations; + u64 *data; +}; + +/** + * struct kdbus_match_rule - a rule appended to a match entry + * @type: An item type to match against + * @bloom_mask: Bloom mask to match a message's filter against, used + * with KDBUS_ITEM_BLOOM_MASK + * @name: Name to match against, used with KDBUS_ITEM_NAME, + * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE} + * @old_id: ID to match against, used with + * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE}, + * KDBUS_ITEM_ID_REMOVE + * @new_id: ID to match against, used with + * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE}, + * KDBUS_ITEM_ID_ADD + * @src_id: ID to match against, used with KDBUS_ITEM_ID + * @dst_id: Message destination ID, used with KDBUS_ITEM_DST_ID + * @rules_entry: Entry in the entry's rules list + */ +struct kdbus_match_rule { + u64 type; + union { + struct kdbus_bloom_mask bloom_mask; + struct { + char *name; + u64 old_id; + u64 new_id; + }; + u64 src_id; + u64 dst_id; + }; + struct list_head rules_entry; +}; + +static void kdbus_match_rule_free(struct kdbus_match_rule *rule) +{ + if (!rule) + return; + + switch (rule->type) { + case KDBUS_ITEM_BLOOM_MASK: + kfree(rule->bloom_mask.data); + break; + + case KDBUS_ITEM_NAME: + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_REMOVE: + case KDBUS_ITEM_NAME_CHANGE: + kfree(rule->name); + break; + + case KDBUS_ITEM_ID: + case KDBUS_ITEM_DST_ID: + case KDBUS_ITEM_ID_ADD: + case KDBUS_ITEM_ID_REMOVE: + break; + + default: + BUG(); + } + + list_del(&rule->rules_entry); + kfree(rule); +} + +static void kdbus_match_entry_free(struct kdbus_match_entry *entry) +{ + struct kdbus_match_rule *r, *tmp; + + if (!entry) + return; + + list_for_each_entry_safe(r, tmp, &entry->rules_list, rules_entry) + kdbus_match_rule_free(r); + + list_del(&entry->list_entry); + kfree(entry); +} + +/** + * kdbus_match_db_free() - free match db resources + * @mdb: The match database + */ +void kdbus_match_db_free(struct kdbus_match_db *mdb) +{ + struct kdbus_match_entry *entry, *tmp; + + if (!mdb) + return; + + list_for_each_entry_safe(entry, tmp, &mdb->entries_list, list_entry) + kdbus_match_entry_free(entry); + + kfree(mdb); +} + +/** + * kdbus_match_db_new() - create a new match database + * + * Return: a new kdbus_match_db on success, ERR_PTR on failure.
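 + * + * A minimal usage sketch of the ERR_PTR convention, as connection setup + * code elsewhere in the driver might consume it (conn is assumed here, + * not part of this file): + * + * conn->match_db = kdbus_match_db_new(); + * if (IS_ERR(conn->match_db)) + * return PTR_ERR(conn->match_db);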
+ */ +struct kdbus_match_db *kdbus_match_db_new(void) +{ + struct kdbus_match_db *d; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return ERR_PTR(-ENOMEM); + + init_rwsem(&d->mdb_rwlock); + INIT_LIST_HEAD(&d->entries_list); + + return d; +} + +static bool kdbus_match_bloom(const struct kdbus_bloom_filter *filter, + const struct kdbus_bloom_mask *mask, + const struct kdbus_conn *conn) +{ + size_t n = conn->ep->bus->bloom.size / sizeof(u64); + const u64 *m; + size_t i; + + /* + * The message's filter carries a generation identifier, the + * match's mask possibly carries an array of multiple generations + * of the mask. Select the mask with the closest match of the + * filter's generation. + */ + m = mask->data + (min(filter->generation, mask->generations - 1) * n); + + /* + * The message's filter contains the message's properties, + * the match's mask contains the properties to look for in the + * message. Check the mask bit field against the filter bit field + * to see if the message possibly carries the properties the + * connection has subscribed to. + */ + for (i = 0; i < n; i++) + if ((filter->data[i] & m[i]) != m[i]) + return false; + + return true; +} + +static bool kdbus_match_rule_conn(const struct kdbus_match_rule *r, + struct kdbus_conn *c, + const struct kdbus_staging *s) +{ + lockdep_assert_held(&c->ep->bus->name_registry->rwlock); + + switch (r->type) { + case KDBUS_ITEM_BLOOM_MASK: + return kdbus_match_bloom(s->bloom_filter, &r->bloom_mask, c); + case KDBUS_ITEM_ID: + return r->src_id == c->id || r->src_id == KDBUS_MATCH_ID_ANY; + case KDBUS_ITEM_DST_ID: + return r->dst_id == s->msg->dst_id || + r->dst_id == KDBUS_MATCH_ID_ANY; + case KDBUS_ITEM_NAME: + return kdbus_conn_has_name(c, r->name); + default: + return false; + } +} + +static bool kdbus_match_rule_kernel(const struct kdbus_match_rule *r, + const struct kdbus_staging *s) +{ + struct kdbus_item *n = s->notify; + + if (WARN_ON(!n) || n->type != r->type) + return false; + + switch (r->type) { + case KDBUS_ITEM_ID_ADD: + return r->new_id == KDBUS_MATCH_ID_ANY || + r->new_id == n->id_change.id; + case KDBUS_ITEM_ID_REMOVE: + return r->old_id == KDBUS_MATCH_ID_ANY || + r->old_id == n->id_change.id; + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_CHANGE: + case KDBUS_ITEM_NAME_REMOVE: + return (r->old_id == KDBUS_MATCH_ID_ANY || + r->old_id == n->name_change.old_id.id) && + (r->new_id == KDBUS_MATCH_ID_ANY || + r->new_id == n->name_change.new_id.id) && + (!r->name || !strcmp(r->name, n->name_change.name)); + default: + return false; + } +} + +static bool kdbus_match_rules(const struct kdbus_match_entry *entry, + struct kdbus_conn *c, + const struct kdbus_staging *s) +{ + struct kdbus_match_rule *r; + + list_for_each_entry(r, &entry->rules_list, rules_entry) + if ((c && !kdbus_match_rule_conn(r, c, s)) || + (!c && !kdbus_match_rule_kernel(r, s))) + return false; + + return true; +} + +/** + * kdbus_match_db_match_msg() - match a msg object against the database entries + * @mdb: The match database + * @conn_src: The connection object originating the message + * @staging: Staging object containing the message to match against + * + * This function will walk through all the database entries previously uploaded + * with kdbus_cmd_match_add(). As soon as any of them has an all-satisfied rule + * set, this function will return true. + * + * The caller must hold the registry lock of conn_src->ep->bus if conn_src + * is non-NULL. + * + * Return: true if there was a matching database entry, false otherwise.
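 + * + * Rules within one entry are ANDed, while separate entries are ORed. For + * example, an entry carrying both a KDBUS_ITEM_ID and a KDBUS_ITEM_BLOOM_MASK + * rule matches only messages that originate from that connection ID and whose + * bloom filter satisfies the mask; to additionally match, say, ID-add + * notifications, a second, independent entry with a KDBUS_ITEM_ID_ADD rule + * is required.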
+ */ +bool kdbus_match_db_match_msg(struct kdbus_match_db *mdb, + struct kdbus_conn *conn_src, + const struct kdbus_staging *staging) +{ + struct kdbus_match_entry *entry; + bool matched = false; + + down_read(&mdb->mdb_rwlock); + list_for_each_entry(entry, &mdb->entries_list, list_entry) { + matched = kdbus_match_rules(entry, conn_src, staging); + if (matched) + break; + } + up_read(&mdb->mdb_rwlock); + + return matched; +} + +static int kdbus_match_db_remove_unlocked(struct kdbus_match_db *mdb, + u64 cookie) +{ + struct kdbus_match_entry *entry, *tmp; + bool found = false; + + list_for_each_entry_safe(entry, tmp, &mdb->entries_list, list_entry) + if (entry->cookie == cookie) { + kdbus_match_entry_free(entry); + --mdb->entries_count; + found = true; + } + + return found ? 0 : -EBADSLT; +} + +/** + * kdbus_cmd_match_add() - handle KDBUS_CMD_MATCH_ADD + * @conn: connection to operate on + * @argp: command payload + * + * One call to this function (or one ioctl(KDBUS_CMD_MATCH_ADD), respectively) + * adds one new database entry with n rules attached to it. Each rule is + * described with a kdbus_item, and an entry is considered matching if all + * its rules are satisfied. + * + * The items attached to a kdbus_cmd_match struct have the following mapping: + * + * KDBUS_ITEM_BLOOM_MASK: A bloom mask + * KDBUS_ITEM_NAME: A connection's source name + * KDBUS_ITEM_ID: A connection ID + * KDBUS_ITEM_DST_ID: A connection ID + * KDBUS_ITEM_NAME_ADD: + * KDBUS_ITEM_NAME_REMOVE: + * KDBUS_ITEM_NAME_CHANGE: Well-known name changes, carry + * kdbus_notify_name_change + * KDBUS_ITEM_ID_ADD: + * KDBUS_ITEM_ID_REMOVE: Connection ID changes, carry + * kdbus_notify_id_change + * + * For kdbus_notify_{id,name}_change structs, only the ID and name fields + * are looked at when adding an entry. The flags are unused. + * + * Also note that KDBUS_ITEM_BLOOM_MASK, KDBUS_ITEM_NAME, KDBUS_ITEM_ID, + * and KDBUS_ITEM_DST_ID are used to match messages from userspace, while the + * others apply to kernel-generated notifications. + * + * Return: >=0 on success, negative error code on failure.
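 + * + * A minimal user-space sketch (conn_fd is hypothetical; the uapi header + * is assumed), installing a match for all messages from connection ID 4711: + * + * struct { + * struct kdbus_cmd_match cmd; + * struct { + * __u64 size; + * __u64 type; + * __u64 id; + * } item; + * } m = { + * .cmd.size = sizeof(m), + * .cmd.cookie = 1, + * .item.size = sizeof(m.item), + * .item.type = KDBUS_ITEM_ID, + * .item.id = 4711, + * }; + * + * ioctl(conn_fd, KDBUS_CMD_MATCH_ADD, &m);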
+ */ +int kdbus_cmd_match_add(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_match_db *mdb = conn->match_db; + struct kdbus_match_entry *entry = NULL; + struct kdbus_cmd_match *cmd; + struct kdbus_item *item; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_BLOOM_MASK, .multiple = true }, + { .type = KDBUS_ITEM_NAME, .multiple = true }, + { .type = KDBUS_ITEM_ID, .multiple = true }, + { .type = KDBUS_ITEM_DST_ID, .multiple = true }, + { .type = KDBUS_ITEM_NAME_ADD, .multiple = true }, + { .type = KDBUS_ITEM_NAME_REMOVE, .multiple = true }, + { .type = KDBUS_ITEM_NAME_CHANGE, .multiple = true }, + { .type = KDBUS_ITEM_ID_ADD, .multiple = true }, + { .type = KDBUS_ITEM_ID_REMOVE, .multiple = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_MATCH_REPLACE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + ret = -ENOMEM; + goto exit; + } + + entry->cookie = cmd->cookie; + INIT_LIST_HEAD(&entry->list_entry); + INIT_LIST_HEAD(&entry->rules_list); + + KDBUS_ITEMS_FOREACH(item, cmd->items, KDBUS_ITEMS_SIZE(cmd, items)) { + struct kdbus_match_rule *rule; + size_t size = item->size - offsetof(struct kdbus_item, data); + + rule = kzalloc(sizeof(*rule), GFP_KERNEL); + if (!rule) { + ret = -ENOMEM; + goto exit; + } + + rule->type = item->type; + INIT_LIST_HEAD(&rule->rules_entry); + + switch (item->type) { + case KDBUS_ITEM_BLOOM_MASK: { + u64 bsize = conn->ep->bus->bloom.size; + u64 generations; + u64 remainder; + + generations = div64_u64_rem(size, bsize, &remainder); + if (size < bsize || remainder > 0) { + ret = -EDOM; + break; + } + + rule->bloom_mask.data = kmemdup(item->data, + size, GFP_KERNEL); + if (!rule->bloom_mask.data) { + ret = -ENOMEM; + break; + } + + rule->bloom_mask.generations = generations; + break; + } + + case KDBUS_ITEM_NAME: + if (!kdbus_name_is_valid(item->str, false)) { + ret = -EINVAL; + break; + } + + rule->name = kstrdup(item->str, GFP_KERNEL); + if (!rule->name) + ret = -ENOMEM; + + break; + + case KDBUS_ITEM_ID: + rule->src_id = item->id; + break; + + case KDBUS_ITEM_DST_ID: + rule->dst_id = item->id; + break; + + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_REMOVE: + case KDBUS_ITEM_NAME_CHANGE: + rule->old_id = item->name_change.old_id.id; + rule->new_id = item->name_change.new_id.id; + + if (size > sizeof(struct kdbus_notify_name_change)) { + rule->name = kstrdup(item->name_change.name, + GFP_KERNEL); + if (!rule->name) + ret = -ENOMEM; + } + + break; + + case KDBUS_ITEM_ID_ADD: + case KDBUS_ITEM_ID_REMOVE: + if (item->type == KDBUS_ITEM_ID_ADD) + rule->new_id = item->id_change.id; + else + rule->old_id = item->id_change.id; + + break; + } + + if (ret < 0) { + kdbus_match_rule_free(rule); + goto exit; + } + + list_add_tail(&rule->rules_entry, &entry->rules_list); + } + + down_write(&mdb->mdb_rwlock); + + /* Remove any entry that has the same cookie as the current one. */ + if (cmd->flags & KDBUS_MATCH_REPLACE) + kdbus_match_db_remove_unlocked(mdb, entry->cookie); + + /* + * If the above removal caught any entry, there will be room for the + * new one. 
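 + * In that case, the increment below stays at or below KDBUS_MATCH_MAX, + * so a KDBUS_MATCH_REPLACE operation that actually replaced an entry + * cannot fail with -EMFILE.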
+ */ + if (++mdb->entries_count > KDBUS_MATCH_MAX) { + --mdb->entries_count; + ret = -EMFILE; + } else { + list_add_tail(&entry->list_entry, &mdb->entries_list); + entry = NULL; + } + + up_write(&mdb->mdb_rwlock); + +exit: + kdbus_match_entry_free(entry); + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_match_remove() - handle KDBUS_CMD_MATCH_REMOVE + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_match_remove(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_cmd_match *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + down_write(&conn->match_db->mdb_rwlock); + ret = kdbus_match_db_remove_unlocked(conn->match_db, cmd->cookie); + up_write(&conn->match_db->mdb_rwlock); + + return kdbus_args_clear(&args, ret); +} diff -Nur linux-4.2/ipc/kdbus/match.h linux-4.2-pck/ipc/kdbus/match.h --- linux-4.2/ipc/kdbus/match.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/match.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_MATCH_H +#define __KDBUS_MATCH_H + +struct kdbus_conn; +struct kdbus_match_db; +struct kdbus_staging; + +struct kdbus_match_db *kdbus_match_db_new(void); +void kdbus_match_db_free(struct kdbus_match_db *db); +int kdbus_match_db_add(struct kdbus_conn *conn, + struct kdbus_cmd_match *cmd); +int kdbus_match_db_remove(struct kdbus_conn *conn, + struct kdbus_cmd_match *cmd); +bool kdbus_match_db_match_msg(struct kdbus_match_db *db, + struct kdbus_conn *conn_src, + const struct kdbus_staging *staging); + +int kdbus_cmd_match_add(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_match_remove(struct kdbus_conn *conn, void __user *argp); + +#endif diff -Nur linux-4.2/ipc/kdbus/message.c linux-4.2-pck/ipc/kdbus/message.c --- linux-4.2/ipc/kdbus/message.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/message.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,1040 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "domain.h" +#include "endpoint.h" +#include "handle.h" +#include "item.h" +#include "match.h" +#include "message.h" +#include "names.h" +#include "policy.h" + +static const char * const zeros = "\0\0\0\0\0\0\0"; + +static struct kdbus_gaps *kdbus_gaps_new(size_t n_memfds, size_t n_fds) +{ + size_t size_offsets, size_memfds, size_fds, size; + struct kdbus_gaps *gaps; + + size_offsets = n_memfds * sizeof(*gaps->memfd_offsets); + size_memfds = n_memfds * sizeof(*gaps->memfd_files); + size_fds = n_fds * sizeof(*gaps->fd_files); + size = sizeof(*gaps) + size_offsets + size_memfds + size_fds; + + gaps = kzalloc(size, GFP_KERNEL); + if (!gaps) + return ERR_PTR(-ENOMEM); + + kref_init(&gaps->kref); + gaps->n_memfds = 0; /* we reserve n_memfds, but don't enforce them */ + gaps->memfd_offsets = (void *)(gaps + 1); + gaps->memfd_files = (void *)((u8 *)gaps->memfd_offsets + size_offsets); + gaps->n_fds = 0; /* we reserve n_fds, but don't enforce them */ + gaps->fd_files = (void *)((u8 *)gaps->memfd_files + size_memfds); + + return gaps; +} + +static void kdbus_gaps_free(struct kref *kref) +{ + struct kdbus_gaps *gaps = container_of(kref, struct kdbus_gaps, kref); + size_t i; + + for (i = 0; i < gaps->n_fds; ++i) + if (gaps->fd_files[i]) + fput(gaps->fd_files[i]); + for (i = 0; i < gaps->n_memfds; ++i) + if (gaps->memfd_files[i]) + fput(gaps->memfd_files[i]); + + kfree(gaps); +} + +/** + * kdbus_gaps_ref() - gain reference + * @gaps: gaps object + * + * Return: @gaps is returned + */ +struct kdbus_gaps *kdbus_gaps_ref(struct kdbus_gaps *gaps) +{ + if (gaps) + kref_get(&gaps->kref); + return gaps; +} + +/** + * kdbus_gaps_unref() - drop reference + * @gaps: gaps object + * + * Return: NULL + */ +struct kdbus_gaps *kdbus_gaps_unref(struct kdbus_gaps *gaps) +{ + if (gaps) + kref_put(&gaps->kref, kdbus_gaps_free); + return NULL; +} + +/** + * kdbus_gaps_install() - install file-descriptors + * @gaps: gaps object, or NULL + * @slice: pool slice that contains the message + * @out_incomplete: output variable to note incomplete fds + * + * This function installs all file-descriptors of @gaps into the current + * process and copies the file-descriptor numbers into the target pool slice. + * + * If the file-descriptors were only partially installed, then @out_incomplete + * will be set to true. Otherwise, it's set to false. + * + * Return: 0 on success, negative error code on failure + */ +int kdbus_gaps_install(struct kdbus_gaps *gaps, struct kdbus_pool_slice *slice, + bool *out_incomplete) +{ + bool incomplete_fds = false; + struct kvec kvec; + size_t i, n_fds; + int ret, *fds; + + if (!gaps) { + /* nothing to do */ + *out_incomplete = incomplete_fds; + return 0; + } + + n_fds = gaps->n_fds + gaps->n_memfds; + if (n_fds < 1) { + /* nothing to do */ + *out_incomplete = incomplete_fds; + return 0; + } + + fds = kmalloc_array(n_fds, sizeof(*fds), GFP_TEMPORARY); + n_fds = 0; + if (!fds) + return -ENOMEM; + + /* 1) allocate fds and copy them over */ + + if (gaps->n_fds > 0) { + for (i = 0; i < gaps->n_fds; ++i) { + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + incomplete_fds = true; + + WARN_ON(!gaps->fd_files[i]); + + fds[n_fds++] = fd < 0 ? -1 : fd; + } + + /* + * The file-descriptor array can only be present once per + * message. Hence, prepare all fds and then copy them over with + * a single kvec.
+ */ + + WARN_ON(!gaps->fd_offset); + + kvec.iov_base = fds; + kvec.iov_len = gaps->n_fds * sizeof(*fds); + ret = kdbus_pool_slice_copy_kvec(slice, gaps->fd_offset, + &kvec, 1, kvec.iov_len); + if (ret < 0) + goto exit; + } + + for (i = 0; i < gaps->n_memfds; ++i) { + int memfd; + + memfd = get_unused_fd_flags(O_CLOEXEC); + if (memfd < 0) { + incomplete_fds = true; + /* memfds are initialized to -1, skip copying them */ + continue; + } + + fds[n_fds++] = memfd; + + /* + * memfds have to be copied individually as they each are put + * into a separate item. This should not be an issue, though, + * as usually there is no need to send more than one memfd per + * message. + */ + + WARN_ON(!gaps->memfd_offsets[i]); + WARN_ON(!gaps->memfd_files[i]); + + kvec.iov_base = &memfd; + kvec.iov_len = sizeof(memfd); + ret = kdbus_pool_slice_copy_kvec(slice, gaps->memfd_offsets[i], + &kvec, 1, kvec.iov_len); + if (ret < 0) + goto exit; + } + + /* 2) install fds now that everything was successful */ + + for (i = 0; i < gaps->n_fds; ++i) + if (fds[i] >= 0) + fd_install(fds[i], get_file(gaps->fd_files[i])); + for (i = 0; i < gaps->n_memfds; ++i) + if (fds[gaps->n_fds + i] >= 0) + fd_install(fds[gaps->n_fds + i], + get_file(gaps->memfd_files[i])); + + ret = 0; + +exit: + if (ret < 0) + for (i = 0; i < n_fds; ++i) + put_unused_fd(fds[i]); + kfree(fds); + *out_incomplete = incomplete_fds; + return ret; +} + +static struct file *kdbus_get_fd(int fd) +{ + struct file *f, *ret; + struct inode *inode; + struct socket *sock; + + if (fd < 0) + return ERR_PTR(-EBADF); + + f = fget_raw(fd); + if (!f) + return ERR_PTR(-EBADF); + + inode = file_inode(f); + sock = S_ISSOCK(inode->i_mode) ? SOCKET_I(inode) : NULL; + + if (f->f_mode & FMODE_PATH) + ret = f; /* O_PATH is always allowed */ + else if (f->f_op == &kdbus_handle_ops) + ret = ERR_PTR(-EOPNOTSUPP); /* disallow kdbus-fd over kdbus */ + else if (sock && sock->sk && sock->ops && sock->ops->family == PF_UNIX) + ret = ERR_PTR(-EOPNOTSUPP); /* disallow UDS over kdbus */ + else + ret = f; /* all others are allowed */ + + if (f != ret) + fput(f); + + return ret; +} + +static struct file *kdbus_get_memfd(const struct kdbus_memfd *memfd) +{ + const int m = F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL; + struct file *f, *ret; + int s; + + if (memfd->fd < 0) + return ERR_PTR(-EBADF); + + f = fget(memfd->fd); + if (!f) + return ERR_PTR(-EBADF); + + s = shmem_get_seals(f); + if (s < 0) + ret = ERR_PTR(-EMEDIUMTYPE); + else if ((s & m) != m) + ret = ERR_PTR(-ETXTBSY); + else if (memfd->start + memfd->size > (u64)i_size_read(file_inode(f))) + ret = ERR_PTR(-EFAULT); + else + ret = f; + + if (f != ret) + fput(f); + + return ret; +} + +static int kdbus_msg_examine(struct kdbus_msg *msg, struct kdbus_bus *bus, + struct kdbus_cmd_send *cmd, size_t *out_n_memfds, + size_t *out_n_fds, size_t *out_n_parts) +{ + struct kdbus_item *item, *fds = NULL, *bloom = NULL, *dstname = NULL; + u64 n_parts, n_memfds, n_fds, vec_size; + + /* + * Step 1: + * Validate the message and command parameters.
+ */ + + /* KDBUS_PAYLOAD_KERNEL is reserved for kernel messages */ + if (msg->payload_type == KDBUS_PAYLOAD_KERNEL) + return -EINVAL; + + if (msg->dst_id == KDBUS_DST_ID_BROADCAST) { + /* broadcasts must be marked as signals */ + if (!(msg->flags & KDBUS_MSG_SIGNAL)) + return -EBADMSG; + /* broadcasts cannot have timeouts */ + if (msg->timeout_ns > 0) + return -ENOTUNIQ; + } + + if (msg->flags & KDBUS_MSG_EXPECT_REPLY) { + /* if you expect a reply, you must specify a timeout */ + if (msg->timeout_ns == 0) + return -EINVAL; + /* signals cannot have replies */ + if (msg->flags & KDBUS_MSG_SIGNAL) + return -ENOTUNIQ; + } else { + /* must expect reply if sent as a synchronous call */ + if (cmd->flags & KDBUS_SEND_SYNC_REPLY) + return -EINVAL; + /* cannot mark replies as signal */ + if (msg->cookie_reply && (msg->flags & KDBUS_MSG_SIGNAL)) + return -EINVAL; + } + + /* + * Step 2: + * Validate all passed items. While at it, gather some statistics that + * are required to allocate state objects later on. + * + * Generic item validation has already been done via + * kdbus_item_validate(). Furthermore, the number of items is naturally + * limited by the maximum message size. Hence, only non-generic item + * checks are performed here (mainly integer overflow tests). + */ + + n_parts = 0; + n_memfds = 0; + n_fds = 0; + vec_size = 0; + + KDBUS_ITEMS_FOREACH(item, msg->items, KDBUS_ITEMS_SIZE(msg, items)) { + switch (item->type) { + case KDBUS_ITEM_PAYLOAD_VEC: { + void __force __user *ptr = KDBUS_PTR(item->vec.address); + u64 size = item->vec.size; + + if (vec_size + size < vec_size) + return -EMSGSIZE; + if (vec_size + size > KDBUS_MSG_MAX_PAYLOAD_VEC_SIZE) + return -EMSGSIZE; + if (ptr && unlikely(!access_ok(VERIFY_READ, ptr, size))) + return -EFAULT; + + if (ptr || size % 8) /* data or padding */ + ++n_parts; + break; + } + case KDBUS_ITEM_PAYLOAD_MEMFD: { + u64 start = item->memfd.start; + u64 size = item->memfd.size; + + if (start + size < start) + return -EMSGSIZE; + if (n_memfds >= KDBUS_MSG_MAX_MEMFD_ITEMS) + return -E2BIG; + + ++n_memfds; + if (size % 8) /* vec-padding required */ + ++n_parts; + break; + } + case KDBUS_ITEM_FDS: { + if (fds) + return -EEXIST; + + fds = item; + n_fds = KDBUS_ITEM_PAYLOAD_SIZE(item) / sizeof(int); + if (n_fds > KDBUS_CONN_MAX_FDS_PER_USER) + return -EMFILE; + + break; + } + case KDBUS_ITEM_BLOOM_FILTER: { + u64 bloom_size; + + if (bloom) + return -EEXIST; + + bloom = item; + bloom_size = KDBUS_ITEM_PAYLOAD_SIZE(item) - + offsetof(struct kdbus_bloom_filter, data); + if (!KDBUS_IS_ALIGNED8(bloom_size)) + return -EFAULT; + if (bloom_size != bus->bloom.size) + return -EDOM; + + break; + } + case KDBUS_ITEM_DST_NAME: { + if (dstname) + return -EEXIST; + + dstname = item; + if (!kdbus_name_is_valid(item->str, false)) + return -EINVAL; + if (msg->dst_id == KDBUS_DST_ID_BROADCAST) + return -EBADMSG; + + break; + } + default: + return -EINVAL; + } + } + + /* + * Step 3: + * Validate that required items were actually passed, and that no item + * contradicts the message flags.
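 + * For example, a KDBUS_MSG_SIGNAL message must carry a bloom filter + * (so subscribers can match on it) while a non-signal message must + * not; likewise, a KDBUS_DST_ID_NAME destination is only complete + * together with a KDBUS_ITEM_DST_NAME item.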
+ */ + + /* bloom filters must be attached _iff_ it's a signal */ + if (!(msg->flags & KDBUS_MSG_SIGNAL) != !bloom) + return -EBADMSG; + /* destination name is required if no ID is given */ + if (msg->dst_id == KDBUS_DST_ID_NAME && !dstname) + return -EDESTADDRREQ; + /* cannot send file-descriptors attached to broadcasts */ + if (msg->dst_id == KDBUS_DST_ID_BROADCAST && fds) + return -ENOTUNIQ; + + *out_n_memfds = n_memfds; + *out_n_fds = n_fds; + *out_n_parts = n_parts; + + return 0; +} + +static bool kdbus_staging_merge_vecs(struct kdbus_staging *staging, + struct kdbus_item **prev_item, + struct iovec **prev_vec, + const struct kdbus_item *merge) +{ + void __user *ptr = (void __user *)KDBUS_PTR(merge->vec.address); + u64 padding = merge->vec.size % 8; + struct kdbus_item *prev = *prev_item; + struct iovec *vec = *prev_vec; + + /* XXX: merging is disabled so far */ + if (0 && prev && prev->type == KDBUS_ITEM_PAYLOAD_OFF && + !merge->vec.address == !prev->vec.address) { + /* + * If we merge two VECs, we can always drop the second + * PAYLOAD_VEC item. Hence, include its size in the previous + * one. + */ + prev->vec.size += merge->vec.size; + + if (ptr) { + /* + * If we merge two data VECs, we need two iovecs to copy + * the data. But the items can be easily merged by + * summing their lengths. + */ + vec = &staging->parts[staging->n_parts++]; + vec->iov_len = merge->vec.size; + vec->iov_base = ptr; + staging->n_payload += vec->iov_len; + } else if (padding) { + /* + * If we merge two 0-vecs with the second 0-vec + * requiring padding, we need to insert an iovec to copy + * the 0-padding. We try merging it with the previous + * 0-padding iovec. This might end up with an + * iov_len==0, in which case we simply drop the iovec. + */ + if (vec) { + staging->n_payload -= vec->iov_len; + vec->iov_len = prev->vec.size % 8; + if (!vec->iov_len) { + --staging->n_parts; + vec = NULL; + } else { + staging->n_payload += vec->iov_len; + } + } else { + vec = &staging->parts[staging->n_parts++]; + vec->iov_len = padding; + vec->iov_base = (char __user *)zeros; + staging->n_payload += vec->iov_len; + } + } else { + /* + * If we merge two 0-vecs with the second 0-vec having + * no padding, we know the padding of the first stays + * the same. Hence, @vec needs no adjustment. + */ + } + + /* successfully merged with previous item */ + merge = prev; + } else { + /* + * If we cannot merge the payload item with the previous one, + * we simply insert a new iovec for the data/padding. + */ + if (ptr) { + vec = &staging->parts[staging->n_parts++]; + vec->iov_len = merge->vec.size; + vec->iov_base = ptr; + staging->n_payload += vec->iov_len; + } else if (padding) { + vec = &staging->parts[staging->n_parts++]; + vec->iov_len = padding; + vec->iov_base = (char __user *)zeros; + staging->n_payload += vec->iov_len; + } else { + vec = NULL; + } + } + + *prev_item = (struct kdbus_item *)merge; + *prev_vec = vec; + + return merge == prev; +} + +static int kdbus_staging_import(struct kdbus_staging *staging) +{ + struct kdbus_item *it, *item, *last, *prev_payload; + struct kdbus_gaps *gaps = staging->gaps; + struct kdbus_msg *msg = staging->msg; + struct iovec *part, *prev_part; + bool drop_item; + + drop_item = false; + last = NULL; + prev_payload = NULL; + prev_part = NULL; + + /* + * We modify msg->items along the way; make sure to use @item as offset + * to the next item (instead of the iterator @it). 
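 + * (Whenever an item is merged away, @it runs ahead of @item; the + * memmove() at the top of the loop then compacts the remaining items + * in place.)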
+ */ + for (it = item = msg->items; + it >= msg->items && + (u8 *)it < (u8 *)msg + msg->size && + (u8 *)it + it->size <= (u8 *)msg + msg->size; ) { + /* + * If we dropped items along the way, move current item to + * front. We must not access @it afterwards, but use @item + * instead! + */ + if (it != item) + memmove(item, it, it->size); + it = (void *)((u8 *)it + KDBUS_ALIGN8(item->size)); + + switch (item->type) { + case KDBUS_ITEM_PAYLOAD_VEC: { + size_t offset = staging->n_payload; + + if (kdbus_staging_merge_vecs(staging, &prev_payload, + &prev_part, item)) { + drop_item = true; + } else if (item->vec.address) { + /* real offset is patched later on */ + item->type = KDBUS_ITEM_PAYLOAD_OFF; + item->vec.offset = offset; + } else { + item->type = KDBUS_ITEM_PAYLOAD_OFF; + item->vec.offset = ~0ULL; + } + + break; + } + case KDBUS_ITEM_PAYLOAD_MEMFD: { + struct file *f; + + f = kdbus_get_memfd(&item->memfd); + if (IS_ERR(f)) + return PTR_ERR(f); + + gaps->memfd_files[gaps->n_memfds] = f; + gaps->memfd_offsets[gaps->n_memfds] = + (u8 *)&item->memfd.fd - (u8 *)msg; + ++gaps->n_memfds; + + /* memfds cannot be merged */ + prev_payload = item; + prev_part = NULL; + + /* insert padding to make following VECs aligned */ + if (item->memfd.size % 8) { + part = &staging->parts[staging->n_parts++]; + part->iov_len = item->memfd.size % 8; + part->iov_base = (char __user *)zeros; + staging->n_payload += part->iov_len; + } + + break; + } + case KDBUS_ITEM_FDS: { + size_t i, n_fds; + + n_fds = KDBUS_ITEM_PAYLOAD_SIZE(item) / sizeof(int); + for (i = 0; i < n_fds; ++i) { + struct file *f; + + f = kdbus_get_fd(item->fds[i]); + if (IS_ERR(f)) + return PTR_ERR(f); + + gaps->fd_files[gaps->n_fds++] = f; + } + + gaps->fd_offset = (u8 *)item->fds - (u8 *)msg; + + break; + } + case KDBUS_ITEM_BLOOM_FILTER: + staging->bloom_filter = &item->bloom_filter; + break; + case KDBUS_ITEM_DST_NAME: + staging->dst_name = item->str; + break; + } + + /* drop item if we merged it with a previous one */ + if (drop_item) { + drop_item = false; + } else { + last = item; + item = KDBUS_ITEM_NEXT(item); + } + } + + /* adjust message size regarding dropped items */ + msg->size = offsetof(struct kdbus_msg, items); + if (last) + msg->size += ((u8 *)last - (u8 *)msg->items) + last->size; + + return 0; +} + +static void kdbus_staging_reserve(struct kdbus_staging *staging) +{ + struct iovec *part; + + part = &staging->parts[staging->n_parts++]; + part->iov_base = (void __user *)zeros; + part->iov_len = 0; +} + +static struct kdbus_staging *kdbus_staging_new(struct kdbus_bus *bus, + size_t n_parts, + size_t msg_extra_size) +{ + const size_t reserved_parts = 5; /* see below for explanation */ + struct kdbus_staging *staging; + int ret; + + n_parts += reserved_parts; + + staging = kzalloc(sizeof(*staging) + n_parts * sizeof(*staging->parts) + + msg_extra_size, GFP_TEMPORARY); + if (!staging) + return ERR_PTR(-ENOMEM); + + staging->msg_seqnum = atomic64_inc_return(&bus->last_message_id); + staging->n_parts = 0; /* we reserve n_parts, but don't enforce them */ + staging->parts = (void *)(staging + 1); + + if (msg_extra_size) /* if requested, allocate message, too */ + staging->msg = (void *)((u8 *)staging->parts + + n_parts * sizeof(*staging->parts)); + + staging->meta_proc = kdbus_meta_proc_new(); + if (IS_ERR(staging->meta_proc)) { + ret = PTR_ERR(staging->meta_proc); + staging->meta_proc = NULL; + goto error; + } + + staging->meta_conn = kdbus_meta_conn_new(); + if (IS_ERR(staging->meta_conn)) { + ret = PTR_ERR(staging->meta_conn); + 
staging->meta_conn = NULL; + goto error; + } + + /* + * Prepare iovecs to copy the message into the target pool. We use the + * following iovecs: + * * iovec to copy "kdbus_msg.size" + * * iovec to copy "struct kdbus_msg" (minus size) plus items + * * iovec for possible padding after the items + * * iovec for metadata items + * * iovec for possible padding after the metadata items + * + * Make sure to update @reserved_parts if you add more parts here. + */ + + kdbus_staging_reserve(staging); /* msg.size */ + kdbus_staging_reserve(staging); /* msg (minus msg.size) plus items */ + kdbus_staging_reserve(staging); /* msg padding */ + kdbus_staging_reserve(staging); /* meta */ + kdbus_staging_reserve(staging); /* meta padding */ + + return staging; + +error: + kdbus_staging_free(staging); + return ERR_PTR(ret); +} + +struct kdbus_staging *kdbus_staging_new_kernel(struct kdbus_bus *bus, + u64 dst, u64 cookie_timeout, + size_t it_size, size_t it_type) +{ + struct kdbus_staging *staging; + size_t size; + + size = offsetof(struct kdbus_msg, items) + + KDBUS_ITEM_HEADER_SIZE + it_size; + + staging = kdbus_staging_new(bus, 0, KDBUS_ALIGN8(size)); + if (IS_ERR(staging)) + return ERR_CAST(staging); + + staging->msg->size = size; + staging->msg->flags = (dst == KDBUS_DST_ID_BROADCAST) ? + KDBUS_MSG_SIGNAL : 0; + staging->msg->dst_id = dst; + staging->msg->src_id = KDBUS_SRC_ID_KERNEL; + staging->msg->payload_type = KDBUS_PAYLOAD_KERNEL; + staging->msg->cookie_reply = cookie_timeout; + staging->notify = staging->msg->items; + staging->notify->size = KDBUS_ITEM_HEADER_SIZE + it_size; + staging->notify->type = it_type; + + return staging; +} + +struct kdbus_staging *kdbus_staging_new_user(struct kdbus_bus *bus, + struct kdbus_cmd_send *cmd, + struct kdbus_msg *msg) +{ + const size_t reserved_parts = 1; /* see below for explanation */ + size_t n_memfds, n_fds, n_parts; + struct kdbus_staging *staging; + int ret; + + /* + * Examine user-supplied message and figure out how many resources we + * need to allocate in our staging area. This requires us to iterate + * the message twice, but saves us from re-allocating our resources + * all the time. + */ + + ret = kdbus_msg_examine(msg, bus, cmd, &n_memfds, &n_fds, &n_parts); + if (ret < 0) + return ERR_PTR(ret); + + n_parts += reserved_parts; + + /* + * Allocate staging area with the number of required resources. Make + * sure that we have enough iovecs for all required parts pre-allocated + * so this will hopefully be the only memory allocation for this + * message transaction. + */ + + staging = kdbus_staging_new(bus, n_parts, 0); + if (IS_ERR(staging)) + return ERR_CAST(staging); + + staging->msg = msg; + + /* + * If the message contains memfds or fd items, we need to remember some + * state so we can fill in the requested information at RECV time. + * File-descriptors cannot be passed at SEND time. Hence, allocate a + * gaps-object to remember that state. That gaps object is linked to + * from the staging area, but will also be linked to from the message + * queue of each peer. Hence, each receiver owns a reference to it, and + * it will later be used to fill the 'gaps' in the message that couldn't + * be filled at SEND time. + * Note that the 'gaps' object is read-only once the staging-allocator + * returns. There might be connections receiving a queued message while + * the sender still broadcasts the message to other receivers.
+ */ + + if (n_memfds > 0 || n_fds > 0) { + staging->gaps = kdbus_gaps_new(n_memfds, n_fds); + if (IS_ERR(staging->gaps)) { + ret = PTR_ERR(staging->gaps); + staging->gaps = NULL; + kdbus_staging_free(staging); + return ERR_PTR(ret); + } + } + + /* + * kdbus_staging_new() already reserves parts for message setup. For + * user-supplied messages, we add the following iovecs: + * ... variable number of iovecs for payload ... + * * final iovec for possible padding of payload + * + * Make sure to update @reserved_parts if you add more parts here. + */ + + ret = kdbus_staging_import(staging); /* payload */ + kdbus_staging_reserve(staging); /* payload padding */ + + if (ret < 0) + goto error; + + return staging; + +error: + kdbus_staging_free(staging); + return ERR_PTR(ret); +} + +struct kdbus_staging *kdbus_staging_free(struct kdbus_staging *staging) +{ + if (!staging) + return NULL; + + kdbus_meta_conn_unref(staging->meta_conn); + kdbus_meta_proc_unref(staging->meta_proc); + kdbus_gaps_unref(staging->gaps); + kfree(staging); + + return NULL; +} + +static int kdbus_staging_collect_metadata(struct kdbus_staging *staging, + struct kdbus_conn *src, + struct kdbus_conn *dst, + u64 *out_attach) +{ + u64 attach; + int ret; + + if (src) + attach = kdbus_meta_msg_mask(src, dst); + else + attach = KDBUS_ATTACH_TIMESTAMP; /* metadata for kernel msgs */ + + if (src && !src->meta_fake) { + ret = kdbus_meta_proc_collect(staging->meta_proc, attach); + if (ret < 0) + return ret; + } + + ret = kdbus_meta_conn_collect(staging->meta_conn, src, + staging->msg_seqnum, attach); + if (ret < 0) + return ret; + + *out_attach = attach; + return 0; +} + +/** + * kdbus_staging_emit() - emit linearized message in target pool + * @staging: staging object to create message from + * @src: sender of the message (or NULL) + * @dst: target connection to allocate message for + * + * This allocates a pool-slice for @dst and copies the message provided by + * @staging into it. The new slice is then returned to the caller for further + * processing. It's not linked into any queue yet. + * + * Return: Newly allocated slice or ERR_PTR on failure. + */ +struct kdbus_pool_slice *kdbus_staging_emit(struct kdbus_staging *staging, + struct kdbus_conn *src, + struct kdbus_conn *dst) +{ + struct kdbus_item *item, *meta_items = NULL; + struct kdbus_pool_slice *slice = NULL; + size_t off, size, meta_size; + struct iovec *v; + u64 attach, msg_size; + int ret; + + /* + * Step 1: + * Collect metadata from @src depending on the attach-flags allowed for + * @dst. Translate it into the namespaces pinned by @dst. + */ + + ret = kdbus_staging_collect_metadata(staging, src, dst, &attach); + if (ret < 0) + goto error; + + ret = kdbus_meta_emit(staging->meta_proc, NULL, staging->meta_conn, + dst, attach, &meta_items, &meta_size); + if (ret < 0) + goto error; + + /* + * Step 2: + * Setup iovecs for the message. See kdbus_staging_new() for allocation + * of those iovecs. All reserved iovecs have been initialized with + * iov_len=0 + iov_base=zeros. Furthermore, the iovecs to copy the + * actual message payload have already been initialized and need not be + * touched.
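 + * + * The assembled slice thus ends up with this layout (sketch; every + * block is padded to 8-byte alignment): + * + * | msg.size | msg body + items | pad | meta items | pad | payload ... | pad |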
+ */ + + v = staging->parts; + msg_size = staging->msg->size; + + /* msg.size */ + v->iov_len = sizeof(msg_size); + v->iov_base = (void __user *)&msg_size; + ++v; + + /* msg (after msg.size) plus items */ + v->iov_len = staging->msg->size - sizeof(staging->msg->size); + v->iov_base = (void __user *)((u8 *)staging->msg + + sizeof(staging->msg->size)); + ++v; + + /* padding after msg */ + v->iov_len = KDBUS_ALIGN8(staging->msg->size) - staging->msg->size; + v->iov_base = (void __user *)zeros; + ++v; + + if (meta_size > 0) { + /* metadata items */ + v->iov_len = meta_size; + v->iov_base = (void __user *)meta_items; + ++v; + + /* padding after metadata */ + v->iov_len = KDBUS_ALIGN8(meta_size) - meta_size; + v->iov_base = (void __user *)zeros; + ++v; + + msg_size = KDBUS_ALIGN8(msg_size) + meta_size; + } else { + /* metadata items */ + v->iov_len = 0; + v->iov_base = (void __user *)zeros; + ++v; + + /* padding after metadata */ + v->iov_len = 0; + v->iov_base = (void __user *)zeros; + ++v; + } + + /* ... payload iovecs are already filled in ... */ + + /* compute overall size and fill in padding after payload */ + size = KDBUS_ALIGN8(msg_size); + + if (staging->n_payload > 0) { + size += staging->n_payload; + + v = &staging->parts[staging->n_parts - 1]; + v->iov_len = KDBUS_ALIGN8(size) - size; + v->iov_base = (void __user *)zeros; + + size = KDBUS_ALIGN8(size); + } + + /* + * Step 3: + * The PAYLOAD_OFF items in the message contain a relative 'offset' + * field that tells the receiver where to find the actual payload. This + * offset is relative to the start of the message, and as such depends + * on the size of the metadata items we inserted. This size is variable + * and changes for each peer we send the message to. Hence, we remember + * the last relative offset that was used to calculate the 'offset' + * fields. For each message, we re-calculate it and patch all items, in + * case it changed. + */ + + off = KDBUS_ALIGN8(msg_size); + + if (off != staging->i_payload) { + KDBUS_ITEMS_FOREACH(item, staging->msg->items, + KDBUS_ITEMS_SIZE(staging->msg, items)) { + if (item->type != KDBUS_ITEM_PAYLOAD_OFF) + continue; + + item->vec.offset -= staging->i_payload; + item->vec.offset += off; + } + + staging->i_payload = off; + } + + /* + * Step 4: + * Allocate pool slice and copy over all data. Make sure to properly + * account on user quota. + */ + + ret = kdbus_conn_quota_inc(dst, src ? src->user : NULL, size, + staging->gaps ? staging->gaps->n_fds : 0); + if (ret < 0) + goto error; + + slice = kdbus_pool_slice_alloc(dst->pool, size, true); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto error; + } + + WARN_ON(kdbus_pool_slice_size(slice) != size); + + ret = kdbus_pool_slice_copy_iovec(slice, 0, staging->parts, + staging->n_parts, size); + if (ret < 0) + goto error; + + /* all done, return slice to caller */ + goto exit; + +error: + if (slice) + kdbus_conn_quota_dec(dst, src ? src->user : NULL, size, + staging->gaps ? 
staging->gaps->n_fds : 0); + kdbus_pool_slice_release(slice); + slice = ERR_PTR(ret); +exit: + kfree(meta_items); + return slice; +} diff -Nur linux-4.2/ipc/kdbus/message.h linux-4.2-pck/ipc/kdbus/message.h --- linux-4.2/ipc/kdbus/message.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/message.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_MESSAGE_H +#define __KDBUS_MESSAGE_H + +#include <linux/fs.h> +#include <linux/kref.h> +#include <uapi/linux/kdbus.h> + +struct kdbus_bus; +struct kdbus_conn; +struct kdbus_meta_conn; +struct kdbus_meta_proc; +struct kdbus_pool_slice; + +/** + * struct kdbus_gaps - gaps in message to be filled later + * @kref: Reference counter + * @n_memfds: Number of memfds + * @memfd_offsets: Offsets of kdbus_memfd items in target slice + * @memfd_files: Array of sent memfd files + * @n_fds: Number of fds + * @fd_files: Array of sent fd files + * @fd_offset: Offset of fd-array in target slice + * + * The 'gaps' object is used to track data that is needed to fill gaps in a + * message at RECV time. Usually, we try to compile the whole message at SEND + * time. This has the advantage that we don't have to cache any information and + * can keep the memory consumption small. Furthermore, all copy operations can + * be combined into a single function call, which speeds up transactions + * considerably. + * However, things like file-descriptors can only be fully installed at RECV + * time. The gaps object tracks this data and pins it until a message is + * received. The gaps object is shared between all receivers of the same + * message. + */ +struct kdbus_gaps { + struct kref kref; + + /* state tracking for KDBUS_ITEM_PAYLOAD_MEMFD entries */ + size_t n_memfds; + u64 *memfd_offsets; + struct file **memfd_files; + + /* state tracking for KDBUS_ITEM_FDS */ + size_t n_fds; + struct file **fd_files; + u64 fd_offset; +}; + +struct kdbus_gaps *kdbus_gaps_ref(struct kdbus_gaps *gaps); +struct kdbus_gaps *kdbus_gaps_unref(struct kdbus_gaps *gaps); +int kdbus_gaps_install(struct kdbus_gaps *gaps, struct kdbus_pool_slice *slice, + bool *out_incomplete); + +/** + * struct kdbus_staging - staging area to import messages + * @msg: User-supplied message + * @gaps: Gaps-object created during import (or NULL if empty) + * @msg_seqnum: Message sequence number + * @notify_entry: Entry into list of kernel-generated notifications + * @i_payload: Current relative index of start of payload + * @n_payload: Total number of bytes needed for payload + * @n_parts: Number of parts + * @parts: Array of iovecs that make up the whole message + * @meta_proc: Process metadata of the sender (or NULL if empty) + * @meta_conn: Connection metadata of the sender (or NULL if empty) + * @bloom_filter: Pointer to the bloom-item in @msg, or NULL + * @dst_name: Pointer to the dst-name-item in @msg, or NULL + * @notify: Pointer to the notification item in @msg, or NULL + * + * The kdbus_staging object is a temporary staging area to import user-supplied + * messages into the kernel. It is only used during SEND and dropped once the + * message is queued.
Any data that cannot be collected during SEND, is + * collected in a kdbus_gaps object and attached to the message queue. + */ +struct kdbus_staging { + struct kdbus_msg *msg; + struct kdbus_gaps *gaps; + u64 msg_seqnum; + struct list_head notify_entry; + + /* crafted iovecs to copy the message */ + size_t i_payload; + size_t n_payload; + size_t n_parts; + struct iovec *parts; + + /* metadata state */ + struct kdbus_meta_proc *meta_proc; + struct kdbus_meta_conn *meta_conn; + + /* cached pointers into @msg */ + const struct kdbus_bloom_filter *bloom_filter; + const char *dst_name; + struct kdbus_item *notify; +}; + +struct kdbus_staging *kdbus_staging_new_kernel(struct kdbus_bus *bus, + u64 dst, u64 cookie_timeout, + size_t it_size, size_t it_type); +struct kdbus_staging *kdbus_staging_new_user(struct kdbus_bus *bus, + struct kdbus_cmd_send *cmd, + struct kdbus_msg *msg); +struct kdbus_staging *kdbus_staging_free(struct kdbus_staging *staging); +struct kdbus_pool_slice *kdbus_staging_emit(struct kdbus_staging *staging, + struct kdbus_conn *src, + struct kdbus_conn *dst); + +#endif diff -Nur linux-4.2/ipc/kdbus/metadata.c linux-4.2-pck/ipc/kdbus/metadata.c --- linux-4.2/ipc/kdbus/metadata.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/metadata.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,1347 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "item.h" +#include "message.h" +#include "metadata.h" +#include "names.h" + +/** + * struct kdbus_meta_proc - Process metadata + * @kref: Reference counting + * @lock: Object lock + * @collected: Bitmask of collected items + * @valid: Bitmask of collected and valid items + * @cred: Credentials + * @pid: PID of process + * @tgid: TGID of process + * @ppid: PPID of process + * @tid_comm: TID comm line + * @pid_comm: PID comm line + * @exe_path: Executable path + * @root_path: Root-FS path + * @cmdline: Command-line + * @cgroup: Full cgroup path + * @seclabel: Seclabel + * @audit_loginuid: Audit login-UID + * @audit_sessionid: Audit session-ID + */ +struct kdbus_meta_proc { + struct kref kref; + struct mutex lock; + u64 collected; + u64 valid; + + /* KDBUS_ITEM_CREDS */ + /* KDBUS_ITEM_AUXGROUPS */ + /* KDBUS_ITEM_CAPS */ + const struct cred *cred; + + /* KDBUS_ITEM_PIDS */ + struct pid *pid; + struct pid *tgid; + struct pid *ppid; + + /* KDBUS_ITEM_TID_COMM */ + char tid_comm[TASK_COMM_LEN]; + /* KDBUS_ITEM_PID_COMM */ + char pid_comm[TASK_COMM_LEN]; + + /* KDBUS_ITEM_EXE */ + struct path exe_path; + struct path root_path; + + /* KDBUS_ITEM_CMDLINE */ + char *cmdline; + + /* KDBUS_ITEM_CGROUP */ + char *cgroup; + + /* KDBUS_ITEM_SECLABEL */ + char *seclabel; + + /* KDBUS_ITEM_AUDIT */ + kuid_t audit_loginuid; + unsigned int audit_sessionid; +}; + +/** + * struct kdbus_meta_conn + * @kref: Reference counting + * @lock: Object lock + * @collected: Bitmask of collected items + * @valid: Bitmask of collected and valid items + * @ts: Timestamp values + * @owned_names_items: Serialized items for owned names + * @owned_names_size: Size of @owned_names_items + * @conn_description: Connection description + */ +struct kdbus_meta_conn { + struct kref kref; + struct mutex lock; + u64 collected; + u64 valid; + + /* KDBUS_ITEM_TIMESTAMP */ + struct kdbus_timestamp ts; + + /* KDBUS_ITEM_OWNED_NAME */ + struct kdbus_item *owned_names_items; + size_t owned_names_size; + + /* KDBUS_ITEM_CONN_DESCRIPTION */ + char *conn_description; +}; + +/* fixed size equivalent of "kdbus_caps" */ +struct kdbus_meta_caps { + u32 last_cap; + struct { + u32 caps[_KERNEL_CAPABILITY_U32S]; + } set[4]; +}; + +/** + * kdbus_meta_proc_new() - Create process metadata object + * + * Return: Pointer to new object on success, ERR_PTR on failure. 
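+ *
+ * Rough lifecycle sketch (illustrative only, error handling abbreviated):
+ *
+ *	mp = kdbus_meta_proc_new();
+ *	if (IS_ERR(mp))
+ *		return PTR_ERR(mp);
+ *	ret = kdbus_meta_proc_collect(mp, KDBUS_ATTACH_PIDS);
+ *	... hand @mp to kdbus_meta_emit(), then drop the reference ...
+ *	mp = kdbus_meta_proc_unref(mp);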
+ */
+struct kdbus_meta_proc *kdbus_meta_proc_new(void)
+{
+	struct kdbus_meta_proc *mp;
+
+	mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+	if (!mp)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&mp->kref);
+	mutex_init(&mp->lock);
+
+	return mp;
+}
+
+static void kdbus_meta_proc_free(struct kref *kref)
+{
+	struct kdbus_meta_proc *mp = container_of(kref, struct kdbus_meta_proc,
+						  kref);
+
+	path_put(&mp->exe_path);
+	path_put(&mp->root_path);
+	if (mp->cred)
+		put_cred(mp->cred);
+	put_pid(mp->ppid);
+	put_pid(mp->tgid);
+	put_pid(mp->pid);
+
+	kfree(mp->seclabel);
+	kfree(mp->cmdline);
+	kfree(mp->cgroup);
+	kfree(mp);
+}
+
+/**
+ * kdbus_meta_proc_ref() - Gain reference
+ * @mp:		Process metadata object
+ *
+ * Return: @mp is returned
+ */
+struct kdbus_meta_proc *kdbus_meta_proc_ref(struct kdbus_meta_proc *mp)
+{
+	if (mp)
+		kref_get(&mp->kref);
+	return mp;
+}
+
+/**
+ * kdbus_meta_proc_unref() - Drop reference
+ * @mp:		Process metadata object
+ *
+ * Return: NULL
+ */
+struct kdbus_meta_proc *kdbus_meta_proc_unref(struct kdbus_meta_proc *mp)
+{
+	if (mp)
+		kref_put(&mp->kref, kdbus_meta_proc_free);
+	return NULL;
+}
+
+static void kdbus_meta_proc_collect_pids(struct kdbus_meta_proc *mp)
+{
+	struct task_struct *parent;
+
+	mp->pid = get_pid(task_pid(current));
+	mp->tgid = get_pid(task_tgid(current));
+
+	rcu_read_lock();
+	parent = rcu_dereference(current->real_parent);
+	mp->ppid = get_pid(task_tgid(parent));
+	rcu_read_unlock();
+
+	mp->valid |= KDBUS_ATTACH_PIDS;
+}
+
+static void kdbus_meta_proc_collect_tid_comm(struct kdbus_meta_proc *mp)
+{
+	get_task_comm(mp->tid_comm, current);
+	mp->valid |= KDBUS_ATTACH_TID_COMM;
+}
+
+static void kdbus_meta_proc_collect_pid_comm(struct kdbus_meta_proc *mp)
+{
+	get_task_comm(mp->pid_comm, current->group_leader);
+	mp->valid |= KDBUS_ATTACH_PID_COMM;
+}
+
+static void kdbus_meta_proc_collect_exe(struct kdbus_meta_proc *mp)
+{
+	struct file *exe_file;
+
+	rcu_read_lock();
+	exe_file = rcu_dereference(current->mm->exe_file);
+	if (exe_file) {
+		mp->exe_path = exe_file->f_path;
+		path_get(&mp->exe_path);
+		get_fs_root(current->fs, &mp->root_path);
+		mp->valid |= KDBUS_ATTACH_EXE;
+	}
+	rcu_read_unlock();
+}
+
+static int kdbus_meta_proc_collect_cmdline(struct kdbus_meta_proc *mp)
+{
+	struct mm_struct *mm = current->mm;
+	char *cmdline;
+
+	if (!mm->arg_end)
+		return 0;
+
+	cmdline = strndup_user((const char __user *)mm->arg_start,
+			       mm->arg_end - mm->arg_start);
+	if (IS_ERR(cmdline))
+		return PTR_ERR(cmdline);
+
+	mp->cmdline = cmdline;
+	mp->valid |= KDBUS_ATTACH_CMDLINE;
+
+	return 0;
+}
+
+static int kdbus_meta_proc_collect_cgroup(struct kdbus_meta_proc *mp)
+{
+#ifdef CONFIG_CGROUPS
+	void *page;
+	char *s;
+
+	page = (void *)__get_free_page(GFP_TEMPORARY);
+	if (!page)
+		return -ENOMEM;
+
+	s = task_cgroup_path(current, page, PAGE_SIZE);
+	if (s) {
+		mp->cgroup = kstrdup(s, GFP_KERNEL);
+		if (!mp->cgroup) {
+			free_page((unsigned long)page);
+			return -ENOMEM;
+		}
+	}
+
+	free_page((unsigned long)page);
+	mp->valid |= KDBUS_ATTACH_CGROUP;
+#endif
+
+	return 0;
+}
+
+static int kdbus_meta_proc_collect_seclabel(struct kdbus_meta_proc *mp)
+{
+#ifdef CONFIG_SECURITY
+	char *ctx = NULL;
+	u32 sid, len;
+	int ret;
+
+	security_task_getsecid(current, &sid);
+	ret = security_secid_to_secctx(sid, &ctx, &len);
+	if (ret < 0) {
+		/*
+		 * EOPNOTSUPP means no security module is active,
+		 * so let's skip adding the seclabel. This effectively
+		 * drops the SECLABEL item.
+		 */
+		return (ret == -EOPNOTSUPP) ?
0 : ret; + } + + mp->seclabel = kstrdup(ctx, GFP_KERNEL); + security_release_secctx(ctx, len); + if (!mp->seclabel) + return -ENOMEM; + + mp->valid |= KDBUS_ATTACH_SECLABEL; +#endif + + return 0; +} + +static void kdbus_meta_proc_collect_audit(struct kdbus_meta_proc *mp) +{ +#ifdef CONFIG_AUDITSYSCALL + mp->audit_loginuid = audit_get_loginuid(current); + mp->audit_sessionid = audit_get_sessionid(current); + mp->valid |= KDBUS_ATTACH_AUDIT; +#endif +} + +/** + * kdbus_meta_proc_collect() - Collect process metadata + * @mp: Process metadata object + * @what: Attach flags to collect + * + * This collects process metadata from current and saves it in @mp. + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_meta_proc_collect(struct kdbus_meta_proc *mp, u64 what) +{ + int ret; + + if (!mp || !(what & (KDBUS_ATTACH_CREDS | + KDBUS_ATTACH_PIDS | + KDBUS_ATTACH_AUXGROUPS | + KDBUS_ATTACH_TID_COMM | + KDBUS_ATTACH_PID_COMM | + KDBUS_ATTACH_EXE | + KDBUS_ATTACH_CMDLINE | + KDBUS_ATTACH_CGROUP | + KDBUS_ATTACH_CAPS | + KDBUS_ATTACH_SECLABEL | + KDBUS_ATTACH_AUDIT))) + return 0; + + mutex_lock(&mp->lock); + + /* creds, auxgrps and caps share "struct cred" as context */ + { + const u64 m_cred = KDBUS_ATTACH_CREDS | + KDBUS_ATTACH_AUXGROUPS | + KDBUS_ATTACH_CAPS; + + if ((what & m_cred) && !(mp->collected & m_cred)) { + mp->cred = get_current_cred(); + mp->valid |= m_cred; + mp->collected |= m_cred; + } + } + + if ((what & KDBUS_ATTACH_PIDS) && + !(mp->collected & KDBUS_ATTACH_PIDS)) { + kdbus_meta_proc_collect_pids(mp); + mp->collected |= KDBUS_ATTACH_PIDS; + } + + if ((what & KDBUS_ATTACH_TID_COMM) && + !(mp->collected & KDBUS_ATTACH_TID_COMM)) { + kdbus_meta_proc_collect_tid_comm(mp); + mp->collected |= KDBUS_ATTACH_TID_COMM; + } + + if ((what & KDBUS_ATTACH_PID_COMM) && + !(mp->collected & KDBUS_ATTACH_PID_COMM)) { + kdbus_meta_proc_collect_pid_comm(mp); + mp->collected |= KDBUS_ATTACH_PID_COMM; + } + + if ((what & KDBUS_ATTACH_EXE) && + !(mp->collected & KDBUS_ATTACH_EXE)) { + kdbus_meta_proc_collect_exe(mp); + mp->collected |= KDBUS_ATTACH_EXE; + } + + if ((what & KDBUS_ATTACH_CMDLINE) && + !(mp->collected & KDBUS_ATTACH_CMDLINE)) { + ret = kdbus_meta_proc_collect_cmdline(mp); + if (ret < 0) + goto exit_unlock; + mp->collected |= KDBUS_ATTACH_CMDLINE; + } + + if ((what & KDBUS_ATTACH_CGROUP) && + !(mp->collected & KDBUS_ATTACH_CGROUP)) { + ret = kdbus_meta_proc_collect_cgroup(mp); + if (ret < 0) + goto exit_unlock; + mp->collected |= KDBUS_ATTACH_CGROUP; + } + + if ((what & KDBUS_ATTACH_SECLABEL) && + !(mp->collected & KDBUS_ATTACH_SECLABEL)) { + ret = kdbus_meta_proc_collect_seclabel(mp); + if (ret < 0) + goto exit_unlock; + mp->collected |= KDBUS_ATTACH_SECLABEL; + } + + if ((what & KDBUS_ATTACH_AUDIT) && + !(mp->collected & KDBUS_ATTACH_AUDIT)) { + kdbus_meta_proc_collect_audit(mp); + mp->collected |= KDBUS_ATTACH_AUDIT; + } + + ret = 0; + +exit_unlock: + mutex_unlock(&mp->lock); + return ret; +} + +/** + * kdbus_meta_fake_new() - Create fake metadata object + * + * Return: Pointer to new object on success, ERR_PTR on failure. 
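+ *
+ * Rough usage sketch (illustrative only; @creds, @pids and @seclabel are
+ * caller-supplied):
+ *
+ *	mf = kdbus_meta_fake_new();
+ *	if (IS_ERR(mf))
+ *		return PTR_ERR(mf);
+ *	ret = kdbus_meta_fake_collect(mf, creds, pids, seclabel);
+ *	...
+ *	mf = kdbus_meta_fake_free(mf);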
+ */ +struct kdbus_meta_fake *kdbus_meta_fake_new(void) +{ + struct kdbus_meta_fake *mf; + + mf = kzalloc(sizeof(*mf), GFP_KERNEL); + if (!mf) + return ERR_PTR(-ENOMEM); + + return mf; +} + +/** + * kdbus_meta_fake_free() - Free fake metadata object + * @mf: Fake metadata object + * + * Return: NULL + */ +struct kdbus_meta_fake *kdbus_meta_fake_free(struct kdbus_meta_fake *mf) +{ + if (mf) { + put_pid(mf->ppid); + put_pid(mf->tgid); + put_pid(mf->pid); + kfree(mf->seclabel); + kfree(mf); + } + + return NULL; +} + +/** + * kdbus_meta_fake_collect() - Fill fake metadata from faked credentials + * @mf: Fake metadata object + * @creds: Creds to set, may be %NULL + * @pids: PIDs to set, may be %NULL + * @seclabel: Seclabel to set, may be %NULL + * + * This function takes information stored in @creds, @pids and @seclabel and + * resolves them to kernel-representations, if possible. This call uses the + * current task's namespaces to resolve the given information. + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_meta_fake_collect(struct kdbus_meta_fake *mf, + const struct kdbus_creds *creds, + const struct kdbus_pids *pids, + const char *seclabel) +{ + if (mf->valid) + return -EALREADY; + + if (creds) { + struct user_namespace *ns = current_user_ns(); + + mf->uid = make_kuid(ns, creds->uid); + mf->euid = make_kuid(ns, creds->euid); + mf->suid = make_kuid(ns, creds->suid); + mf->fsuid = make_kuid(ns, creds->fsuid); + + mf->gid = make_kgid(ns, creds->gid); + mf->egid = make_kgid(ns, creds->egid); + mf->sgid = make_kgid(ns, creds->sgid); + mf->fsgid = make_kgid(ns, creds->fsgid); + + if ((creds->uid != (uid_t)-1 && !uid_valid(mf->uid)) || + (creds->euid != (uid_t)-1 && !uid_valid(mf->euid)) || + (creds->suid != (uid_t)-1 && !uid_valid(mf->suid)) || + (creds->fsuid != (uid_t)-1 && !uid_valid(mf->fsuid)) || + (creds->gid != (gid_t)-1 && !gid_valid(mf->gid)) || + (creds->egid != (gid_t)-1 && !gid_valid(mf->egid)) || + (creds->sgid != (gid_t)-1 && !gid_valid(mf->sgid)) || + (creds->fsgid != (gid_t)-1 && !gid_valid(mf->fsgid))) + return -EINVAL; + + mf->valid |= KDBUS_ATTACH_CREDS; + } + + if (pids) { + mf->pid = get_pid(find_vpid(pids->tid)); + mf->tgid = get_pid(find_vpid(pids->pid)); + mf->ppid = get_pid(find_vpid(pids->ppid)); + + if ((pids->tid != 0 && !mf->pid) || + (pids->pid != 0 && !mf->tgid) || + (pids->ppid != 0 && !mf->ppid)) { + put_pid(mf->pid); + put_pid(mf->tgid); + put_pid(mf->ppid); + mf->pid = NULL; + mf->tgid = NULL; + mf->ppid = NULL; + return -EINVAL; + } + + mf->valid |= KDBUS_ATTACH_PIDS; + } + + if (seclabel) { + mf->seclabel = kstrdup(seclabel, GFP_KERNEL); + if (!mf->seclabel) + return -ENOMEM; + + mf->valid |= KDBUS_ATTACH_SECLABEL; + } + + return 0; +} + +/** + * kdbus_meta_conn_new() - Create connection metadata object + * + * Return: Pointer to new object on success, ERR_PTR on failure. 
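+ *
+ * Rough usage sketch (illustrative only; see kdbus_meta_conn_collect() for
+ * the locking requirements):
+ *
+ *	mc = kdbus_meta_conn_new();
+ *	if (IS_ERR(mc))
+ *		return PTR_ERR(mc);
+ *	ret = kdbus_meta_conn_collect(mc, conn, seqnum, what);
+ *	...
+ *	mc = kdbus_meta_conn_unref(mc);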
+ */ +struct kdbus_meta_conn *kdbus_meta_conn_new(void) +{ + struct kdbus_meta_conn *mc; + + mc = kzalloc(sizeof(*mc), GFP_KERNEL); + if (!mc) + return ERR_PTR(-ENOMEM); + + kref_init(&mc->kref); + mutex_init(&mc->lock); + + return mc; +} + +static void kdbus_meta_conn_free(struct kref *kref) +{ + struct kdbus_meta_conn *mc = + container_of(kref, struct kdbus_meta_conn, kref); + + kfree(mc->conn_description); + kfree(mc->owned_names_items); + kfree(mc); +} + +/** + * kdbus_meta_conn_ref() - Gain reference + * @mc: Connection metadata object + */ +struct kdbus_meta_conn *kdbus_meta_conn_ref(struct kdbus_meta_conn *mc) +{ + if (mc) + kref_get(&mc->kref); + return mc; +} + +/** + * kdbus_meta_conn_unref() - Drop reference + * @mc: Connection metadata object + */ +struct kdbus_meta_conn *kdbus_meta_conn_unref(struct kdbus_meta_conn *mc) +{ + if (mc) + kref_put(&mc->kref, kdbus_meta_conn_free); + return NULL; +} + +static void kdbus_meta_conn_collect_timestamp(struct kdbus_meta_conn *mc, + u64 msg_seqnum) +{ + mc->ts.monotonic_ns = ktime_get_ns(); + mc->ts.realtime_ns = ktime_get_real_ns(); + + if (msg_seqnum) + mc->ts.seqnum = msg_seqnum; + + mc->valid |= KDBUS_ATTACH_TIMESTAMP; +} + +static int kdbus_meta_conn_collect_names(struct kdbus_meta_conn *mc, + struct kdbus_conn *conn) +{ + const struct kdbus_name_owner *owner; + struct kdbus_item *item; + size_t slen, size; + + lockdep_assert_held(&conn->ep->bus->name_registry->rwlock); + + size = 0; + /* open-code length calculation to avoid final padding */ + list_for_each_entry(owner, &conn->names_list, conn_entry) + if (!(owner->flags & KDBUS_NAME_IN_QUEUE)) + size = KDBUS_ALIGN8(size) + KDBUS_ITEM_HEADER_SIZE + + sizeof(struct kdbus_name) + + strlen(owner->name->name) + 1; + + if (!size) + return 0; + + /* make sure we include zeroed padding for convenience helpers */ + item = kmalloc(KDBUS_ALIGN8(size), GFP_KERNEL); + if (!item) + return -ENOMEM; + + mc->owned_names_items = item; + mc->owned_names_size = size; + + list_for_each_entry(owner, &conn->names_list, conn_entry) { + if (owner->flags & KDBUS_NAME_IN_QUEUE) + continue; + + slen = strlen(owner->name->name) + 1; + kdbus_item_set(item, KDBUS_ITEM_OWNED_NAME, NULL, + sizeof(struct kdbus_name) + slen); + item->name.flags = owner->flags; + memcpy(item->name.name, owner->name->name, slen); + item = KDBUS_ITEM_NEXT(item); + } + + /* sanity check: the buffer should be completely written now */ + WARN_ON((u8 *)item != + (u8 *)mc->owned_names_items + KDBUS_ALIGN8(size)); + + mc->valid |= KDBUS_ATTACH_NAMES; + return 0; +} + +static int kdbus_meta_conn_collect_description(struct kdbus_meta_conn *mc, + struct kdbus_conn *conn) +{ + if (!conn->description) + return 0; + + mc->conn_description = kstrdup(conn->description, GFP_KERNEL); + if (!mc->conn_description) + return -ENOMEM; + + mc->valid |= KDBUS_ATTACH_CONN_DESCRIPTION; + return 0; +} + +/** + * kdbus_meta_conn_collect() - Collect connection metadata + * @mc: Message metadata object + * @conn: Connection to collect data from + * @msg_seqnum: Sequence number of the message to send + * @what: Attach flags to collect + * + * This collects connection metadata from @msg_seqnum and @conn and saves it + * in @mc. + * + * If KDBUS_ATTACH_NAMES is set in @what and @conn is non-NULL, the caller must + * hold the name-registry read-lock of conn->ep->bus->registry. + * + * Return: 0 on success, negative error code on failure. 
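+ *
+ * Locking sketch for a KDBUS_ATTACH_NAMES collection (illustrative only;
+ * @mc, @conn and @seqnum are assumed to be set up by the caller):
+ *
+ *	down_read(&conn->ep->bus->name_registry->rwlock);
+ *	ret = kdbus_meta_conn_collect(mc, conn, seqnum, KDBUS_ATTACH_NAMES);
+ *	up_read(&conn->ep->bus->name_registry->rwlock);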
+ */ +int kdbus_meta_conn_collect(struct kdbus_meta_conn *mc, + struct kdbus_conn *conn, + u64 msg_seqnum, u64 what) +{ + int ret; + + if (!mc || !(what & (KDBUS_ATTACH_TIMESTAMP | + KDBUS_ATTACH_NAMES | + KDBUS_ATTACH_CONN_DESCRIPTION))) + return 0; + + mutex_lock(&mc->lock); + + if (msg_seqnum && (what & KDBUS_ATTACH_TIMESTAMP) && + !(mc->collected & KDBUS_ATTACH_TIMESTAMP)) { + kdbus_meta_conn_collect_timestamp(mc, msg_seqnum); + mc->collected |= KDBUS_ATTACH_TIMESTAMP; + } + + if (conn && (what & KDBUS_ATTACH_NAMES) && + !(mc->collected & KDBUS_ATTACH_NAMES)) { + ret = kdbus_meta_conn_collect_names(mc, conn); + if (ret < 0) + goto exit_unlock; + mc->collected |= KDBUS_ATTACH_NAMES; + } + + if (conn && (what & KDBUS_ATTACH_CONN_DESCRIPTION) && + !(mc->collected & KDBUS_ATTACH_CONN_DESCRIPTION)) { + ret = kdbus_meta_conn_collect_description(mc, conn); + if (ret < 0) + goto exit_unlock; + mc->collected |= KDBUS_ATTACH_CONN_DESCRIPTION; + } + + ret = 0; + +exit_unlock: + mutex_unlock(&mc->lock); + return ret; +} + +static void kdbus_meta_export_caps(struct kdbus_meta_caps *out, + const struct kdbus_meta_proc *mp, + struct user_namespace *user_ns) +{ + struct user_namespace *iter; + const struct cred *cred = mp->cred; + bool parent = false, owner = false; + int i; + + /* + * This translates the effective capabilities of 'cred' into the given + * user-namespace. If the given user-namespace is a child-namespace of + * the user-namespace of 'cred', the mask can be copied verbatim. If + * not, the mask is cleared. + * There's one exception: If 'cred' is the owner of any user-namespace + * in the path between the given user-namespace and the user-namespace + * of 'cred', then it has all effective capabilities set. This means, + * the user who created a user-namespace always has all effective + * capabilities in any child namespaces. Note that this is based on the + * uid of the namespace creator, not the task hierarchy. + */ + for (iter = user_ns; iter; iter = iter->parent) { + if (iter == cred->user_ns) { + parent = true; + break; + } + + if (iter == &init_user_ns) + break; + + if ((iter->parent == cred->user_ns) && + uid_eq(iter->owner, cred->euid)) { + owner = true; + break; + } + } + + out->last_cap = CAP_LAST_CAP; + + CAP_FOR_EACH_U32(i) { + if (parent) { + out->set[0].caps[i] = cred->cap_inheritable.cap[i]; + out->set[1].caps[i] = cred->cap_permitted.cap[i]; + out->set[2].caps[i] = cred->cap_effective.cap[i]; + out->set[3].caps[i] = cred->cap_bset.cap[i]; + } else if (owner) { + out->set[0].caps[i] = 0U; + out->set[1].caps[i] = ~0U; + out->set[2].caps[i] = ~0U; + out->set[3].caps[i] = ~0U; + } else { + out->set[0].caps[i] = 0U; + out->set[1].caps[i] = 0U; + out->set[2].caps[i] = 0U; + out->set[3].caps[i] = 0U; + } + } + + /* clear unused bits */ + for (i = 0; i < 4; i++) + out->set[i].caps[CAP_TO_INDEX(CAP_LAST_CAP)] &= + CAP_LAST_U32_VALID_MASK; +} + +/* This is equivalent to from_kuid_munged(), but maps INVALID_UID to itself */ +static uid_t kdbus_from_kuid_keep(struct user_namespace *ns, kuid_t uid) +{ + return uid_valid(uid) ? from_kuid_munged(ns, uid) : ((uid_t)-1); +} + +/* This is equivalent to from_kgid_munged(), but maps INVALID_GID to itself */ +static gid_t kdbus_from_kgid_keep(struct user_namespace *ns, kgid_t gid) +{ + return gid_valid(gid) ? 
from_kgid_munged(ns, gid) : ((gid_t)-1); +} + +struct kdbus_meta_staging { + const struct kdbus_meta_proc *mp; + const struct kdbus_meta_fake *mf; + const struct kdbus_meta_conn *mc; + const struct kdbus_conn *conn; + u64 mask; + + void *exe; + const char *exe_path; +}; + +static size_t kdbus_meta_measure(struct kdbus_meta_staging *staging) +{ + const struct kdbus_meta_proc *mp = staging->mp; + const struct kdbus_meta_fake *mf = staging->mf; + const struct kdbus_meta_conn *mc = staging->mc; + const u64 mask = staging->mask; + size_t size = 0; + + /* process metadata */ + + if (mf && (mask & KDBUS_ATTACH_CREDS)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_creds)); + else if (mp && (mask & KDBUS_ATTACH_CREDS)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_creds)); + + if (mf && (mask & KDBUS_ATTACH_PIDS)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_pids)); + else if (mp && (mask & KDBUS_ATTACH_PIDS)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_pids)); + + if (mp && (mask & KDBUS_ATTACH_AUXGROUPS)) + size += KDBUS_ITEM_SIZE(mp->cred->group_info->ngroups * + sizeof(u64)); + + if (mp && (mask & KDBUS_ATTACH_TID_COMM)) + size += KDBUS_ITEM_SIZE(strlen(mp->tid_comm) + 1); + + if (mp && (mask & KDBUS_ATTACH_PID_COMM)) + size += KDBUS_ITEM_SIZE(strlen(mp->pid_comm) + 1); + + if (staging->exe_path && (mask & KDBUS_ATTACH_EXE)) + size += KDBUS_ITEM_SIZE(strlen(staging->exe_path) + 1); + + if (mp && (mask & KDBUS_ATTACH_CMDLINE)) + size += KDBUS_ITEM_SIZE(strlen(mp->cmdline) + 1); + + if (mp && (mask & KDBUS_ATTACH_CGROUP)) + size += KDBUS_ITEM_SIZE(strlen(mp->cgroup) + 1); + + if (mp && (mask & KDBUS_ATTACH_CAPS)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_meta_caps)); + + if (mf && (mask & KDBUS_ATTACH_SECLABEL)) + size += KDBUS_ITEM_SIZE(strlen(mf->seclabel) + 1); + else if (mp && (mask & KDBUS_ATTACH_SECLABEL)) + size += KDBUS_ITEM_SIZE(strlen(mp->seclabel) + 1); + + if (mp && (mask & KDBUS_ATTACH_AUDIT)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_audit)); + + /* connection metadata */ + + if (mc && (mask & KDBUS_ATTACH_NAMES)) + size += KDBUS_ALIGN8(mc->owned_names_size); + + if (mc && (mask & KDBUS_ATTACH_CONN_DESCRIPTION)) + size += KDBUS_ITEM_SIZE(strlen(mc->conn_description) + 1); + + if (mc && (mask & KDBUS_ATTACH_TIMESTAMP)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_timestamp)); + + return size; +} + +static struct kdbus_item *kdbus_write_head(struct kdbus_item **iter, + u64 type, u64 size) +{ + struct kdbus_item *item = *iter; + size_t padding; + + item->type = type; + item->size = KDBUS_ITEM_HEADER_SIZE + size; + + /* clear padding */ + padding = KDBUS_ALIGN8(item->size) - item->size; + if (padding) + memset(item->data + size, 0, padding); + + *iter = KDBUS_ITEM_NEXT(item); + return item; +} + +static struct kdbus_item *kdbus_write_full(struct kdbus_item **iter, + u64 type, u64 size, const void *data) +{ + struct kdbus_item *item; + + item = kdbus_write_head(iter, type, size); + memcpy(item->data, data, size); + return item; +} + +static size_t kdbus_meta_write(struct kdbus_meta_staging *staging, void *mem, + size_t size) +{ + struct user_namespace *user_ns = staging->conn->cred->user_ns; + struct pid_namespace *pid_ns = ns_of_pid(staging->conn->pid); + struct kdbus_item *item = NULL, *items = mem; + u8 *end, *owned_names_end = NULL; + + /* process metadata */ + + if (staging->mf && (staging->mask & KDBUS_ATTACH_CREDS)) { + const struct kdbus_meta_fake *mf = staging->mf; + + item = kdbus_write_head(&items, KDBUS_ITEM_CREDS, + sizeof(struct kdbus_creds)); + item->creds = 
(struct kdbus_creds){ + .uid = kdbus_from_kuid_keep(user_ns, mf->uid), + .euid = kdbus_from_kuid_keep(user_ns, mf->euid), + .suid = kdbus_from_kuid_keep(user_ns, mf->suid), + .fsuid = kdbus_from_kuid_keep(user_ns, mf->fsuid), + .gid = kdbus_from_kgid_keep(user_ns, mf->gid), + .egid = kdbus_from_kgid_keep(user_ns, mf->egid), + .sgid = kdbus_from_kgid_keep(user_ns, mf->sgid), + .fsgid = kdbus_from_kgid_keep(user_ns, mf->fsgid), + }; + } else if (staging->mp && (staging->mask & KDBUS_ATTACH_CREDS)) { + const struct cred *c = staging->mp->cred; + + item = kdbus_write_head(&items, KDBUS_ITEM_CREDS, + sizeof(struct kdbus_creds)); + item->creds = (struct kdbus_creds){ + .uid = kdbus_from_kuid_keep(user_ns, c->uid), + .euid = kdbus_from_kuid_keep(user_ns, c->euid), + .suid = kdbus_from_kuid_keep(user_ns, c->suid), + .fsuid = kdbus_from_kuid_keep(user_ns, c->fsuid), + .gid = kdbus_from_kgid_keep(user_ns, c->gid), + .egid = kdbus_from_kgid_keep(user_ns, c->egid), + .sgid = kdbus_from_kgid_keep(user_ns, c->sgid), + .fsgid = kdbus_from_kgid_keep(user_ns, c->fsgid), + }; + } + + if (staging->mf && (staging->mask & KDBUS_ATTACH_PIDS)) { + item = kdbus_write_head(&items, KDBUS_ITEM_PIDS, + sizeof(struct kdbus_pids)); + item->pids = (struct kdbus_pids){ + .pid = pid_nr_ns(staging->mf->tgid, pid_ns), + .tid = pid_nr_ns(staging->mf->pid, pid_ns), + .ppid = pid_nr_ns(staging->mf->ppid, pid_ns), + }; + } else if (staging->mp && (staging->mask & KDBUS_ATTACH_PIDS)) { + item = kdbus_write_head(&items, KDBUS_ITEM_PIDS, + sizeof(struct kdbus_pids)); + item->pids = (struct kdbus_pids){ + .pid = pid_nr_ns(staging->mp->tgid, pid_ns), + .tid = pid_nr_ns(staging->mp->pid, pid_ns), + .ppid = pid_nr_ns(staging->mp->ppid, pid_ns), + }; + } + + if (staging->mp && (staging->mask & KDBUS_ATTACH_AUXGROUPS)) { + const struct group_info *info = staging->mp->cred->group_info; + size_t i; + + item = kdbus_write_head(&items, KDBUS_ITEM_AUXGROUPS, + info->ngroups * sizeof(u64)); + for (i = 0; i < info->ngroups; ++i) + item->data64[i] = from_kgid_munged(user_ns, + GROUP_AT(info, i)); + } + + if (staging->mp && (staging->mask & KDBUS_ATTACH_TID_COMM)) + item = kdbus_write_full(&items, KDBUS_ITEM_TID_COMM, + strlen(staging->mp->tid_comm) + 1, + staging->mp->tid_comm); + + if (staging->mp && (staging->mask & KDBUS_ATTACH_PID_COMM)) + item = kdbus_write_full(&items, KDBUS_ITEM_PID_COMM, + strlen(staging->mp->pid_comm) + 1, + staging->mp->pid_comm); + + if (staging->exe_path && (staging->mask & KDBUS_ATTACH_EXE)) + item = kdbus_write_full(&items, KDBUS_ITEM_EXE, + strlen(staging->exe_path) + 1, + staging->exe_path); + + if (staging->mp && (staging->mask & KDBUS_ATTACH_CMDLINE)) + item = kdbus_write_full(&items, KDBUS_ITEM_CMDLINE, + strlen(staging->mp->cmdline) + 1, + staging->mp->cmdline); + + if (staging->mp && (staging->mask & KDBUS_ATTACH_CGROUP)) + item = kdbus_write_full(&items, KDBUS_ITEM_CGROUP, + strlen(staging->mp->cgroup) + 1, + staging->mp->cgroup); + + if (staging->mp && (staging->mask & KDBUS_ATTACH_CAPS)) { + item = kdbus_write_head(&items, KDBUS_ITEM_CAPS, + sizeof(struct kdbus_meta_caps)); + kdbus_meta_export_caps((void*)&item->caps, staging->mp, + user_ns); + } + + if (staging->mf && (staging->mask & KDBUS_ATTACH_SECLABEL)) + item = kdbus_write_full(&items, KDBUS_ITEM_SECLABEL, + strlen(staging->mf->seclabel) + 1, + staging->mf->seclabel); + else if (staging->mp && (staging->mask & KDBUS_ATTACH_SECLABEL)) + item = kdbus_write_full(&items, KDBUS_ITEM_SECLABEL, + strlen(staging->mp->seclabel) + 1, + 
staging->mp->seclabel); + + if (staging->mp && (staging->mask & KDBUS_ATTACH_AUDIT)) { + item = kdbus_write_head(&items, KDBUS_ITEM_AUDIT, + sizeof(struct kdbus_audit)); + item->audit = (struct kdbus_audit){ + .loginuid = from_kuid(user_ns, + staging->mp->audit_loginuid), + .sessionid = staging->mp->audit_sessionid, + }; + } + + /* connection metadata */ + + if (staging->mc && (staging->mask & KDBUS_ATTACH_NAMES)) { + memcpy(items, staging->mc->owned_names_items, + KDBUS_ALIGN8(staging->mc->owned_names_size)); + owned_names_end = (u8 *)items + staging->mc->owned_names_size; + items = (void *)KDBUS_ALIGN8((unsigned long)owned_names_end); + } + + if (staging->mc && (staging->mask & KDBUS_ATTACH_CONN_DESCRIPTION)) + item = kdbus_write_full(&items, KDBUS_ITEM_CONN_DESCRIPTION, + strlen(staging->mc->conn_description) + 1, + staging->mc->conn_description); + + if (staging->mc && (staging->mask & KDBUS_ATTACH_TIMESTAMP)) + item = kdbus_write_full(&items, KDBUS_ITEM_TIMESTAMP, + sizeof(staging->mc->ts), + &staging->mc->ts); + + /* + * Return real size (minus trailing padding). In case of 'owned_names' + * we cannot deduce it from item->size, so treat it special. + */ + + if (items == (void *)KDBUS_ALIGN8((unsigned long)owned_names_end)) + end = owned_names_end; + else if (item) + end = (u8 *)item + item->size; + else + end = mem; + + WARN_ON((u8 *)items - (u8 *)mem != size); + WARN_ON((void *)KDBUS_ALIGN8((unsigned long)end) != (void *)items); + + return end - (u8 *)mem; +} + +int kdbus_meta_emit(struct kdbus_meta_proc *mp, + struct kdbus_meta_fake *mf, + struct kdbus_meta_conn *mc, + struct kdbus_conn *conn, + u64 mask, + struct kdbus_item **out_items, + size_t *out_size) +{ + struct kdbus_meta_staging staging = {}; + struct kdbus_item *items = NULL; + size_t size = 0; + int ret; + + if (WARN_ON(mf && mp)) + mp = NULL; + + staging.mp = mp; + staging.mf = mf; + staging.mc = mc; + staging.conn = conn; + + /* get mask of valid items */ + if (mf) + staging.mask |= mf->valid; + if (mp) { + mutex_lock(&mp->lock); + staging.mask |= mp->valid; + mutex_unlock(&mp->lock); + } + if (mc) { + mutex_lock(&mc->lock); + staging.mask |= mc->valid; + mutex_unlock(&mc->lock); + } + + staging.mask &= mask; + + if (!staging.mask) { /* bail out if nothing to do */ + ret = 0; + goto exit; + } + + /* EXE is special as it needs a temporary page to assemble */ + if (mp && (staging.mask & KDBUS_ATTACH_EXE)) { + struct path p; + + /* + * XXX: We need access to __d_path() so we can write the path + * relative to conn->root_path. Once upstream, we need + * EXPORT_SYMBOL(__d_path) or an equivalent of d_path() that + * takes the root path directly. Until then, we drop this item + * if the root-paths differ. 
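+		 *
+		 * With such a helper exported, the call would look roughly
+		 * like this (sketch only, not buildable today):
+		 *
+		 *	staging.exe_path = __d_path(&mp->exe_path,
+		 *				    &conn->root_path,
+		 *				    staging.exe, PAGE_SIZE);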
+		 */
+
+		get_fs_root(current->fs, &p);
+		if (path_equal(&p, &conn->root_path)) {
+			staging.exe = (void *)__get_free_page(GFP_TEMPORARY);
+			if (!staging.exe) {
+				path_put(&p);
+				ret = -ENOMEM;
+				goto exit;
+			}
+
+			staging.exe_path = d_path(&mp->exe_path, staging.exe,
+						  PAGE_SIZE);
+			if (IS_ERR(staging.exe_path)) {
+				path_put(&p);
+				ret = PTR_ERR(staging.exe_path);
+				goto exit;
+			}
+		}
+		path_put(&p);
+	}
+
+	size = kdbus_meta_measure(&staging);
+	if (!size) { /* bail out if nothing to do */
+		ret = 0;
+		goto exit;
+	}
+
+	items = kmalloc(size, GFP_KERNEL);
+	if (!items) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	size = kdbus_meta_write(&staging, items, size);
+	if (!size) {
+		kfree(items);
+		items = NULL;
+	}
+
+	ret = 0;
+
+exit:
+	if (staging.exe)
+		free_page((unsigned long)staging.exe);
+	if (ret >= 0) {
+		*out_items = items;
+		*out_size = size;
+	}
+	return ret;
+}
+
+enum {
+	KDBUS_META_PROC_NONE,
+	KDBUS_META_PROC_NORMAL,
+};
+
+/**
+ * kdbus_proc_permission() - check /proc permissions on target pid
+ * @pid_ns:	namespace we operate in
+ * @cred:	credentials of requestor
+ * @target:	target process
+ *
+ * This checks whether a process with credentials @cred can access information
+ * of @target in the namespace @pid_ns. This tries to follow /proc permissions,
+ * but is slightly more restrictive.
+ *
+ * Return: The /proc access level (KDBUS_META_PROC_*) is returned.
+ */
+static unsigned int kdbus_proc_permission(const struct pid_namespace *pid_ns,
+					  const struct cred *cred,
+					  struct pid *target)
+{
+	if (pid_ns->hide_pid < 1)
+		return KDBUS_META_PROC_NORMAL;
+
+	/* XXX: we need groups_search() exported for aux-groups */
+	if (gid_eq(cred->egid, pid_ns->pid_gid))
+		return KDBUS_META_PROC_NORMAL;
+
+	/*
+	 * XXX: If ptrace_may_access(PTRACE_MODE_READ) is granted, you can
+	 *      override hide_pid. However, ptrace_may_access() only supports
+	 *      checking 'current', hence we cannot use this here. But we
+	 *      simply decide to not support this override, so no need to
+	 *      worry.
+	 */
+
+	return KDBUS_META_PROC_NONE;
+}
+
+/**
+ * kdbus_meta_proc_mask() - calculate which metadata would be visible to
+ *			    a connection via /proc
+ * @prv_pid:	pid of metadata provider
+ * @req_pid:	pid of metadata requestor
+ * @req_cred:	credentials of metadata requestor
+ * @wanted:	metadata that is requested
+ *
+ * This checks which metadata items of @prv_pid can be read via /proc by the
+ * requestor @req_pid.
+ *
+ * Return: Set of metadata flags the requestor can see (limited by @wanted).
+ */
+static u64 kdbus_meta_proc_mask(struct pid *prv_pid,
+				struct pid *req_pid,
+				const struct cred *req_cred,
+				u64 wanted)
+{
+	struct pid_namespace *prv_ns, *req_ns;
+	unsigned int proc;
+
+	prv_ns = ns_of_pid(prv_pid);
+	req_ns = ns_of_pid(req_pid);
+
+	/*
+	 * If the sender is not visible in the receiver namespace, then the
+	 * receiver cannot access the sender via its own procfs. Hence, we do
+	 * not attach any additional metadata.
+	 */
+	if (!pid_nr_ns(prv_pid, req_ns))
+		return 0;
+
+	/*
+	 * If the pid-namespace of the receiver has hide_pid set, it cannot see
+	 * any process but its own. We shortcut this /proc permission check if
+	 * provider and requestor are the same. If not, we perform rather
+	 * expensive /proc permission checks.
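+	 *
+	 * For example (illustrative): with hide_pid >= 1, a requestor whose
+	 * egid does not match the namespace's pid_gid is degraded to
+	 * KDBUS_META_PROC_NONE for any foreign process.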
+	 */
+	if (prv_pid == req_pid)
+		proc = KDBUS_META_PROC_NORMAL;
+	else
+		proc = kdbus_proc_permission(req_ns, req_cred, prv_pid);
+
+	/* you need /proc access to read standard process attributes */
+	if (proc < KDBUS_META_PROC_NORMAL)
+		wanted &= ~(KDBUS_ATTACH_TID_COMM |
+			    KDBUS_ATTACH_PID_COMM |
+			    KDBUS_ATTACH_SECLABEL |
+			    KDBUS_ATTACH_CMDLINE |
+			    KDBUS_ATTACH_CGROUP |
+			    KDBUS_ATTACH_AUDIT |
+			    KDBUS_ATTACH_CAPS |
+			    KDBUS_ATTACH_EXE);
+
+	/* clear all non-/proc flags */
+	return wanted & (KDBUS_ATTACH_TID_COMM |
+			 KDBUS_ATTACH_PID_COMM |
+			 KDBUS_ATTACH_SECLABEL |
+			 KDBUS_ATTACH_CMDLINE |
+			 KDBUS_ATTACH_CGROUP |
+			 KDBUS_ATTACH_AUDIT |
+			 KDBUS_ATTACH_CAPS |
+			 KDBUS_ATTACH_EXE);
+}
+
+/**
+ * kdbus_meta_get_mask() - calculate attach flags mask for metadata request
+ * @prv_pid:	pid of metadata provider
+ * @prv_mask:	mask of metadata the provider grants unchecked
+ * @req_pid:	pid of metadata requestor
+ * @req_cred:	credentials of metadata requestor
+ * @req_mask:	mask of metadata that is requested
+ *
+ * This calculates the metadata items that the requestor @req_pid can access
+ * from the metadata provider @prv_pid. This permission check consists of
+ * several different parts:
+ *  - Providers can grant metadata items unchecked. Regardless of their type,
+ *    they're always granted to the requestor. This mask is passed as
+ *    @prv_mask.
+ *  - Basic items (credentials and connection metadata) are granted implicitly
+ *    to everyone. They're publicly available to any bus-user that can see the
+ *    provider.
+ *  - Process credentials that are not granted implicitly follow the same
+ *    permission checks as /proc. This means we always assume a requestor
+ *    process has access to its *own* /proc mount, if it has access to
+ *    kdbusfs.
+ *
+ * Return: Mask of metadata that is granted.
+ */
+static u64 kdbus_meta_get_mask(struct pid *prv_pid, u64 prv_mask,
+			       struct pid *req_pid,
+			       const struct cred *req_cred, u64 req_mask)
+{
+	u64 missing, impl_mask, proc_mask = 0;
+
+	/*
+	 * Connection metadata and basic unix process credentials are
+	 * transmitted implicitly, and cannot be suppressed. Both are required
+	 * to perform user-space policies on the receiver-side. Furthermore,
+	 * connection metadata is public state anyway, and unix credentials
+	 * are needed for UDS-compatibility. We extend them slightly by
+	 * auxiliary groups and additional uids/gids/pids.
+	 */
+	impl_mask = /* connection metadata */
+		    KDBUS_ATTACH_CONN_DESCRIPTION |
+		    KDBUS_ATTACH_TIMESTAMP |
+		    KDBUS_ATTACH_NAMES |
+		    /* credentials and pids */
+		    KDBUS_ATTACH_AUXGROUPS |
+		    KDBUS_ATTACH_CREDS |
+		    KDBUS_ATTACH_PIDS;
+
+	/*
+	 * Calculate the set of metadata that is not granted implicitly nor by
+	 * the sender, but still requested by the receiver. If any are left,
+	 * perform rather expensive /proc access checks for them.
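+	 *
+	 * Worked example (illustrative): if the receiver requests
+	 * KDBUS_ATTACH_PIDS | KDBUS_ATTACH_EXE and the sender granted
+	 * nothing explicitly, PIDS is covered by impl_mask and granted
+	 * right away; only EXE is subjected to the /proc checks below.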
+	 */
+	missing = req_mask & ~((prv_mask | impl_mask) & req_mask);
+	if (missing)
+		proc_mask = kdbus_meta_proc_mask(prv_pid, req_pid, req_cred,
+						 missing);
+
+	return (prv_mask | impl_mask | proc_mask) & req_mask;
+}
+
+/**
+ * kdbus_meta_info_mask() - calculate attach flags mask for a metadata query
+ * @conn:	connection that provides the metadata
+ * @mask:	metadata requested by the caller
+ *
+ * This limits @mask to the items of @conn that the calling task may see,
+ * based on the provider's send attach-flags and the caller's /proc
+ * permissions.
+ *
+ * Return: Mask of metadata that is granted.
+ */
+u64 kdbus_meta_info_mask(const struct kdbus_conn *conn, u64 mask)
+{
+	return kdbus_meta_get_mask(conn->pid,
+				   atomic64_read(&conn->attach_flags_send),
+				   task_pid(current),
+				   current_cred(),
+				   mask);
+}
+
+/**
+ * kdbus_meta_msg_mask() - calculate attach flags mask for a message transfer
+ * @snd:	connection that sends the message
+ * @rcv:	connection that receives the message
+ *
+ * This limits the receiver's requested attach-flags to the metadata of the
+ * sending task that @rcv is allowed to see.
+ *
+ * Return: Mask of metadata that is granted.
+ */
+u64 kdbus_meta_msg_mask(const struct kdbus_conn *snd,
+			const struct kdbus_conn *rcv)
+{
+	return kdbus_meta_get_mask(task_pid(current),
+				   atomic64_read(&snd->attach_flags_send),
+				   rcv->pid,
+				   rcv->cred,
+				   atomic64_read(&rcv->attach_flags_recv));
+}
diff -Nur linux-4.2/ipc/kdbus/metadata.h linux-4.2-pck/ipc/kdbus/metadata.h
--- linux-4.2/ipc/kdbus/metadata.h	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/metadata.h	2015-09-08 00:48:22.235030231 -0300
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_METADATA_H
+#define __KDBUS_METADATA_H
+
+#include
+
+struct kdbus_conn;
+struct kdbus_pool_slice;
+
+struct kdbus_meta_proc;
+struct kdbus_meta_conn;
+
+/**
+ * struct kdbus_meta_fake - Fake metadata
+ * @valid:	Bitmask of collected and valid items
+ * @uid:	UID of process
+ * @euid:	EUID of process
+ * @suid:	SUID of process
+ * @fsuid:	FSUID of process
+ * @gid:	GID of process
+ * @egid:	EGID of process
+ * @sgid:	SGID of process
+ * @fsgid:	FSGID of process
+ * @pid:	PID of process
+ * @tgid:	TGID of process
+ * @ppid:	PPID of process
+ * @seclabel:	Seclabel
+ */
+struct kdbus_meta_fake {
+	u64 valid;
+
+	/* KDBUS_ITEM_CREDS */
+	kuid_t uid, euid, suid, fsuid;
+	kgid_t gid, egid, sgid, fsgid;
+
+	/* KDBUS_ITEM_PIDS */
+	struct pid *pid, *tgid, *ppid;
+
+	/* KDBUS_ITEM_SECLABEL */
+	char *seclabel;
+};
+
+struct kdbus_meta_proc *kdbus_meta_proc_new(void);
+struct kdbus_meta_proc *kdbus_meta_proc_ref(struct kdbus_meta_proc *mp);
+struct kdbus_meta_proc *kdbus_meta_proc_unref(struct kdbus_meta_proc *mp);
+int kdbus_meta_proc_collect(struct kdbus_meta_proc *mp, u64 what);
+
+struct kdbus_meta_fake *kdbus_meta_fake_new(void);
+struct kdbus_meta_fake *kdbus_meta_fake_free(struct kdbus_meta_fake *mf);
+int kdbus_meta_fake_collect(struct kdbus_meta_fake *mf,
+			    const struct kdbus_creds *creds,
+			    const struct kdbus_pids *pids,
+			    const char *seclabel);
+
+struct kdbus_meta_conn *kdbus_meta_conn_new(void);
+struct kdbus_meta_conn *kdbus_meta_conn_ref(struct kdbus_meta_conn *mc);
+struct kdbus_meta_conn *kdbus_meta_conn_unref(struct kdbus_meta_conn *mc);
+int kdbus_meta_conn_collect(struct kdbus_meta_conn *mc,
+			    struct kdbus_conn *conn,
+			    u64 msg_seqnum, u64 what);
+
+int kdbus_meta_emit(struct kdbus_meta_proc *mp,
+		    struct kdbus_meta_fake *mf,
+		    struct kdbus_meta_conn *mc,
+		    struct kdbus_conn *conn,
+		    u64 mask,
+		    struct kdbus_item **out_items,
+		    size_t *out_size);
+u64 kdbus_meta_info_mask(const struct kdbus_conn *conn, u64 mask);
+u64 kdbus_meta_msg_mask(const struct kdbus_conn *snd,
+			const struct kdbus_conn *rcv);
+
+#endif
diff -Nur
linux-4.2/ipc/kdbus/names.c linux-4.2-pck/ipc/kdbus/names.c --- linux-4.2/ipc/kdbus/names.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/names.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,854 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "handle.h" +#include "item.h" +#include "names.h" +#include "notify.h" +#include "policy.h" + +#define KDBUS_NAME_SAVED_MASK (KDBUS_NAME_ALLOW_REPLACEMENT | \ + KDBUS_NAME_QUEUE) + +static bool kdbus_name_owner_is_used(struct kdbus_name_owner *owner) +{ + return !list_empty(&owner->name_entry) || + owner == owner->name->activator; +} + +static struct kdbus_name_owner * +kdbus_name_owner_new(struct kdbus_conn *conn, struct kdbus_name_entry *name, + u64 flags) +{ + struct kdbus_name_owner *owner; + + kdbus_conn_assert_active(conn); + + if (conn->name_count >= KDBUS_CONN_MAX_NAMES) + return ERR_PTR(-E2BIG); + + owner = kmalloc(sizeof(*owner), GFP_KERNEL); + if (!owner) + return ERR_PTR(-ENOMEM); + + owner->flags = flags & KDBUS_NAME_SAVED_MASK; + owner->conn = conn; + owner->name = name; + list_add_tail(&owner->conn_entry, &conn->names_list); + INIT_LIST_HEAD(&owner->name_entry); + + ++conn->name_count; + return owner; +} + +static void kdbus_name_owner_free(struct kdbus_name_owner *owner) +{ + if (!owner) + return; + + WARN_ON(kdbus_name_owner_is_used(owner)); + --owner->conn->name_count; + list_del(&owner->conn_entry); + kfree(owner); +} + +static struct kdbus_name_owner * +kdbus_name_owner_find(struct kdbus_name_entry *name, struct kdbus_conn *conn) +{ + struct kdbus_name_owner *owner; + + /* + * Use conn->names_list over name->queue to make sure boundaries of + * this linear search are controlled by the connection itself. + * Furthermore, this will find normal owners as well as activators + * without any additional code. 
+ */ + list_for_each_entry(owner, &conn->names_list, conn_entry) + if (owner->name == name) + return owner; + + return NULL; +} + +static bool kdbus_name_entry_is_used(struct kdbus_name_entry *name) +{ + return !list_empty(&name->queue) || name->activator; +} + +static struct kdbus_name_owner * +kdbus_name_entry_first(struct kdbus_name_entry *name) +{ + return list_first_entry_or_null(&name->queue, struct kdbus_name_owner, + name_entry); +} + +static struct kdbus_name_entry * +kdbus_name_entry_new(struct kdbus_name_registry *r, u32 hash, + const char *name_str) +{ + struct kdbus_name_entry *name; + size_t namelen; + + lockdep_assert_held(&r->rwlock); + + namelen = strlen(name_str); + + name = kmalloc(sizeof(*name) + namelen + 1, GFP_KERNEL); + if (!name) + return ERR_PTR(-ENOMEM); + + name->name_id = ++r->name_seq_last; + name->activator = NULL; + INIT_LIST_HEAD(&name->queue); + hash_add(r->entries_hash, &name->hentry, hash); + memcpy(name->name, name_str, namelen + 1); + + return name; +} + +static void kdbus_name_entry_free(struct kdbus_name_entry *name) +{ + if (!name) + return; + + WARN_ON(kdbus_name_entry_is_used(name)); + hash_del(&name->hentry); + kfree(name); +} + +static struct kdbus_name_entry * +kdbus_name_entry_find(struct kdbus_name_registry *r, u32 hash, + const char *name_str) +{ + struct kdbus_name_entry *name; + + lockdep_assert_held(&r->rwlock); + + hash_for_each_possible(r->entries_hash, name, hentry, hash) + if (!strcmp(name->name, name_str)) + return name; + + return NULL; +} + +/** + * kdbus_name_registry_new() - create a new name registry + * + * Return: a new kdbus_name_registry on success, ERR_PTR on failure. + */ +struct kdbus_name_registry *kdbus_name_registry_new(void) +{ + struct kdbus_name_registry *r; + + r = kmalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return ERR_PTR(-ENOMEM); + + hash_init(r->entries_hash); + init_rwsem(&r->rwlock); + r->name_seq_last = 0; + + return r; +} + +/** + * kdbus_name_registry_free() - free name registry + * @r: name registry to free, or NULL + * + * Free a name registry and cleanup all internal objects. This is a no-op if + * you pass NULL as registry. + */ +void kdbus_name_registry_free(struct kdbus_name_registry *r) +{ + if (!r) + return; + + WARN_ON(!hash_empty(r->entries_hash)); + kfree(r); +} + +/** + * kdbus_name_lookup_unlocked() - lookup name in registry + * @reg: name registry + * @name: name to lookup + * + * This looks up @name in the given name-registry and returns the + * kdbus_name_entry object. The caller must hold the registry-lock and must not + * access the returned object after releasing the lock. + * + * Return: Pointer to name-entry, or NULL if not found. 
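+ *
+ * Read-side sketch (illustrative; "com.example.Service" is a made-up name):
+ *
+ *	down_read(&reg->rwlock);
+ *	name = kdbus_name_lookup_unlocked(reg, "com.example.Service");
+ *	if (name)
+ *		... inspect the entry while the lock is held ...
+ *	up_read(&reg->rwlock);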
+ */ +struct kdbus_name_entry * +kdbus_name_lookup_unlocked(struct kdbus_name_registry *reg, const char *name) +{ + return kdbus_name_entry_find(reg, kdbus_strhash(name), name); +} + +static int kdbus_name_become_activator(struct kdbus_name_owner *owner, + u64 *return_flags) +{ + if (kdbus_name_owner_is_used(owner)) + return -EALREADY; + if (owner->name->activator) + return -EEXIST; + + owner->name->activator = owner; + owner->flags |= KDBUS_NAME_ACTIVATOR; + + if (kdbus_name_entry_first(owner->name)) { + owner->flags |= KDBUS_NAME_IN_QUEUE; + } else { + owner->flags |= KDBUS_NAME_PRIMARY; + kdbus_notify_name_change(owner->conn->ep->bus, + KDBUS_ITEM_NAME_ADD, + 0, owner->conn->id, + 0, owner->flags, + owner->name->name); + } + + if (return_flags) + *return_flags = owner->flags | KDBUS_NAME_ACQUIRED; + + return 0; +} + +static int kdbus_name_update(struct kdbus_name_owner *owner, u64 flags, + u64 *return_flags) +{ + struct kdbus_name_owner *primary, *activator; + struct kdbus_name_entry *name; + struct kdbus_bus *bus; + u64 nflags = 0; + int ret = 0; + + name = owner->name; + bus = owner->conn->ep->bus; + primary = kdbus_name_entry_first(name); + activator = name->activator; + + /* cannot be activator and acquire a name */ + if (owner == activator) + return -EUCLEAN; + + /* update saved flags */ + owner->flags = flags & KDBUS_NAME_SAVED_MASK; + + if (!primary) { + /* + * No primary owner (but maybe an activator). Take over the + * name. + */ + + list_add(&owner->name_entry, &name->queue); + owner->flags |= KDBUS_NAME_PRIMARY; + nflags |= KDBUS_NAME_ACQUIRED; + + /* move messages to new owner on activation */ + if (activator) { + kdbus_conn_move_messages(owner->conn, activator->conn, + name->name_id); + kdbus_notify_name_change(bus, KDBUS_ITEM_NAME_CHANGE, + activator->conn->id, owner->conn->id, + activator->flags, owner->flags, + name->name); + activator->flags &= ~KDBUS_NAME_PRIMARY; + activator->flags |= KDBUS_NAME_IN_QUEUE; + } else { + kdbus_notify_name_change(bus, KDBUS_ITEM_NAME_ADD, + 0, owner->conn->id, + 0, owner->flags, + name->name); + } + + } else if (owner == primary) { + /* + * Already the primary owner of the name, flags were already + * updated. Nothing to do. + */ + + owner->flags |= KDBUS_NAME_PRIMARY; + + } else if ((primary->flags & KDBUS_NAME_ALLOW_REPLACEMENT) && + (flags & KDBUS_NAME_REPLACE_EXISTING)) { + /* + * We're not the primary owner but can replace it. Move us + * ahead of the primary owner and acquire the name (possibly + * skipping queued owners ahead of us). + */ + + list_del_init(&owner->name_entry); + list_add(&owner->name_entry, &name->queue); + owner->flags |= KDBUS_NAME_PRIMARY; + nflags |= KDBUS_NAME_ACQUIRED; + + kdbus_notify_name_change(bus, KDBUS_ITEM_NAME_CHANGE, + primary->conn->id, owner->conn->id, + primary->flags, owner->flags, + name->name); + + /* requeue old primary, or drop if queueing not wanted */ + if (primary->flags & KDBUS_NAME_QUEUE) { + primary->flags &= ~KDBUS_NAME_PRIMARY; + primary->flags |= KDBUS_NAME_IN_QUEUE; + } else { + list_del_init(&primary->name_entry); + kdbus_name_owner_free(primary); + } + + } else if (flags & KDBUS_NAME_QUEUE) { + /* + * Name is already occupied and we cannot take it over, but + * queuing is allowed. Put us silently on the queue, if not + * already there. 
+		 */
+
+		owner->flags |= KDBUS_NAME_IN_QUEUE;
+		if (!kdbus_name_owner_is_used(owner)) {
+			list_add_tail(&owner->name_entry, &name->queue);
+			nflags |= KDBUS_NAME_ACQUIRED;
+		}
+	} else if (kdbus_name_owner_is_used(owner)) {
+		/*
+		 * Already queued on name, but re-queueing was not requested.
+		 * Make sure to unlink it from the name; the caller is
+		 * responsible for releasing it.
+		 */
+
+		list_del_init(&owner->name_entry);
+	} else {
+		/*
+		 * Name is already claimed and queueing is not requested.
+		 * Return error to the caller.
+		 */
+
+		ret = -EEXIST;
+	}
+
+	if (return_flags)
+		*return_flags = owner->flags | nflags;
+
+	return ret;
+}
+
+int kdbus_name_acquire(struct kdbus_name_registry *reg,
+		       struct kdbus_conn *conn, const char *name_str,
+		       u64 flags, u64 *return_flags)
+{
+	struct kdbus_name_entry *name = NULL;
+	struct kdbus_name_owner *owner = NULL;
+	u32 hash;
+	int ret;
+
+	kdbus_conn_assert_active(conn);
+
+	down_write(&reg->rwlock);
+
+	/*
+	 * Verify the connection has access to the name. Do this before testing
+	 * for double-acquisitions and other errors to make sure we do not leak
+	 * information about this name through possible custom endpoints.
+	 */
+	if (!kdbus_conn_policy_own_name(conn, current_cred(), name_str)) {
+		ret = -EPERM;
+		goto exit;
+	}
+
+	/*
+	 * Lookup the name entry. If it already exists, search for an owner
+	 * entry as we might already own that name. If either does not exist,
+	 * we will allocate a fresh one.
+	 */
+	hash = kdbus_strhash(name_str);
+	name = kdbus_name_entry_find(reg, hash, name_str);
+	if (name) {
+		owner = kdbus_name_owner_find(name, conn);
+	} else {
+		name = kdbus_name_entry_new(reg, hash, name_str);
+		if (IS_ERR(name)) {
+			ret = PTR_ERR(name);
+			name = NULL;
+			goto exit;
+		}
+	}
+
+	/* create name owner object if not already queued */
+	if (!owner) {
+		owner = kdbus_name_owner_new(conn, name, flags);
+		if (IS_ERR(owner)) {
+			ret = PTR_ERR(owner);
+			owner = NULL;
+			goto exit;
+		}
+	}
+
+	if (flags & KDBUS_NAME_ACTIVATOR)
+		ret = kdbus_name_become_activator(owner, return_flags);
+	else
+		ret = kdbus_name_update(owner, flags, return_flags);
+	if (ret < 0)
+		goto exit;
+
+exit:
+	if (owner && !kdbus_name_owner_is_used(owner))
+		kdbus_name_owner_free(owner);
+	if (name && !kdbus_name_entry_is_used(name))
+		kdbus_name_entry_free(name);
+	up_write(&reg->rwlock);
+	kdbus_notify_flush(conn->ep->bus);
+	return ret;
+}
+
+static void kdbus_name_release_unlocked(struct kdbus_name_owner *owner)
+{
+	struct kdbus_name_owner *primary, *next;
+	struct kdbus_name_entry *name;
+
+	name = owner->name;
+	primary = kdbus_name_entry_first(name);
+
+	list_del_init(&owner->name_entry);
+	if (owner == name->activator)
+		name->activator = NULL;
+
+	if (!primary || owner == primary) {
+		next = kdbus_name_entry_first(name);
+		if (!next)
+			next = name->activator;
+
+		if (next) {
+			/* hand to next in queue */
+			next->flags &= ~KDBUS_NAME_IN_QUEUE;
+			next->flags |= KDBUS_NAME_PRIMARY;
+			if (next == name->activator)
+				kdbus_conn_move_messages(next->conn,
+							 owner->conn,
+							 name->name_id);
+
+			kdbus_notify_name_change(owner->conn->ep->bus,
+						 KDBUS_ITEM_NAME_CHANGE,
+						 owner->conn->id, next->conn->id,
+						 owner->flags, next->flags,
+						 name->name);
+		} else {
+			kdbus_notify_name_change(owner->conn->ep->bus,
+						 KDBUS_ITEM_NAME_REMOVE,
+						 owner->conn->id, 0,
+						 owner->flags, 0,
+						 name->name);
+		}
+	}
+
+	kdbus_name_owner_free(owner);
+	if (!kdbus_name_entry_is_used(name))
+		kdbus_name_entry_free(name);
+}
+
+static int kdbus_name_release(struct kdbus_name_registry *reg,
+			      struct kdbus_conn *conn,
+			      const char *name_str)
+{
+	struct kdbus_name_owner *owner;
+	struct kdbus_name_entry *name;
+	int ret = 0;
+
+	down_write(&reg->rwlock);
+	name = kdbus_name_entry_find(reg, kdbus_strhash(name_str), name_str);
+	if (name) {
+		owner = kdbus_name_owner_find(name, conn);
+		if (owner)
+			kdbus_name_release_unlocked(owner);
+		else
+			ret = -EADDRINUSE;
+	} else {
+		ret = -ESRCH;
+	}
+	up_write(&reg->rwlock);
+
+	kdbus_notify_flush(conn->ep->bus);
+	return ret;
+}
+
+/**
+ * kdbus_name_release_all() - remove all name entries of a given connection
+ * @reg:	name registry
+ * @conn:	connection
+ */
+void kdbus_name_release_all(struct kdbus_name_registry *reg,
+			    struct kdbus_conn *conn)
+{
+	struct kdbus_name_owner *owner;
+
+	down_write(&reg->rwlock);
+
+	while ((owner = list_first_entry_or_null(&conn->names_list,
+						 struct kdbus_name_owner,
+						 conn_entry)))
+		kdbus_name_release_unlocked(owner);
+
+	up_write(&reg->rwlock);
+
+	kdbus_notify_flush(conn->ep->bus);
+}
+
+/**
+ * kdbus_name_is_valid() - check if a name is valid
+ * @p:			The name to check
+ * @allow_wildcard:	Whether or not to allow a wildcard name
+ *
+ * A name is valid if all of the following criteria are met:
+ *
+ *  - The name has two or more elements separated by a period ('.') character.
+ *  - All elements must contain at least one character.
+ *  - Each element must only contain the ASCII characters "[A-Z][a-z][0-9]_-"
+ *    and must not begin with a digit.
+ *  - The name must not exceed KDBUS_NAME_MAX_LEN.
+ *  - If @allow_wildcard is true, the name may end in '.*'.
+ *
+ * Examples: "com.example.name0" is valid, while "foo", ".foo" and "0.foo"
+ * are not.
+ */
+bool kdbus_name_is_valid(const char *p, bool allow_wildcard)
+{
+	bool dot, found_dot = false;
+	const char *q;
+
+	for (dot = true, q = p; *q; q++) {
+		if (*q == '.') {
+			if (dot)
+				return false;
+
+			found_dot = true;
+			dot = true;
+		} else {
+			bool good;
+
+			good = isalpha(*q) || (!dot && isdigit(*q)) ||
+				*q == '_' || *q == '-' ||
+				(allow_wildcard && dot &&
+					*q == '*' && *(q + 1) == '\0');
+
+			if (!good)
+				return false;
+
+			dot = false;
+		}
+	}
+
+	if (q - p > KDBUS_NAME_MAX_LEN)
+		return false;
+
+	if (dot)
+		return false;
+
+	if (!found_dot)
+		return false;
+
+	return true;
+}
+
+/**
+ * kdbus_cmd_name_acquire() - handle KDBUS_CMD_NAME_ACQUIRE
+ * @conn:		connection to operate on
+ * @argp:		command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
+ */
+int kdbus_cmd_name_acquire(struct kdbus_conn *conn, void __user *argp)
+{
+	const char *item_name;
+	struct kdbus_cmd *cmd;
+	int ret;
+
+	struct kdbus_arg argv[] = {
+		{ .type = KDBUS_ITEM_NEGOTIATE },
+		{ .type = KDBUS_ITEM_NAME, .mandatory = true },
+	};
+	struct kdbus_args args = {
+		.allowed_flags = KDBUS_FLAG_NEGOTIATE |
+				 KDBUS_NAME_REPLACE_EXISTING |
+				 KDBUS_NAME_ALLOW_REPLACEMENT |
+				 KDBUS_NAME_QUEUE,
+		.argv = argv,
+		.argc = ARRAY_SIZE(argv),
+	};
+
+	if (!kdbus_conn_is_ordinary(conn))
+		return -EOPNOTSUPP;
+
+	ret = kdbus_args_parse(&args, argp, &cmd);
+	if (ret != 0)
+		return ret;
+
+	item_name = argv[1].item->str;
+	if (!kdbus_name_is_valid(item_name, false)) {
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	ret = kdbus_name_acquire(conn->ep->bus->name_registry, conn, item_name,
+				 cmd->flags, &cmd->return_flags);
+
+exit:
+	return kdbus_args_clear(&args, ret);
+}
+
+/**
+ * kdbus_cmd_name_release() - handle KDBUS_CMD_NAME_RELEASE
+ * @conn:		connection to operate on
+ * @argp:		command payload
+ *
+ * Return: >=0 on success, negative error code on failure.
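+ *
+ * As with KDBUS_CMD_NAME_ACQUIRE, the payload is a struct kdbus_cmd carrying
+ * one mandatory KDBUS_ITEM_NAME item that holds the name to release.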
+ */ +int kdbus_cmd_name_release(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_cmd *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_NAME, .mandatory = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + ret = kdbus_name_release(conn->ep->bus->name_registry, conn, + argv[1].item->str); + return kdbus_args_clear(&args, ret); +} + +static int kdbus_list_write(struct kdbus_conn *conn, + struct kdbus_conn *c, + struct kdbus_pool_slice *slice, + size_t *pos, + struct kdbus_name_owner *o, + bool write) +{ + struct kvec kvec[4]; + size_t cnt = 0; + int ret; + + /* info header */ + struct kdbus_info info = { + .size = 0, + .id = c->id, + .flags = c->flags, + }; + + /* fake the header of a kdbus_name item */ + struct { + u64 size; + u64 type; + u64 flags; + } h = {}; + + if (o && !kdbus_conn_policy_see_name_unlocked(conn, current_cred(), + o->name->name)) + return 0; + + kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &info.size); + + /* append name */ + if (o) { + size_t slen = strlen(o->name->name) + 1; + + h.size = offsetof(struct kdbus_item, name.name) + slen; + h.type = KDBUS_ITEM_OWNED_NAME; + h.flags = o->flags; + + kdbus_kvec_set(&kvec[cnt++], &h, sizeof(h), &info.size); + kdbus_kvec_set(&kvec[cnt++], o->name->name, slen, &info.size); + cnt += !!kdbus_kvec_pad(&kvec[cnt], &info.size); + } + + if (write) { + ret = kdbus_pool_slice_copy_kvec(slice, *pos, kvec, + cnt, info.size); + if (ret < 0) + return ret; + } + + *pos += info.size; + return 0; +} + +static int kdbus_list_all(struct kdbus_conn *conn, u64 flags, + struct kdbus_pool_slice *slice, + size_t *pos, bool write) +{ + struct kdbus_conn *c; + size_t p = *pos; + int ret, i; + + hash_for_each(conn->ep->bus->conn_hash, i, c, hentry) { + bool added = false; + + /* skip monitors */ + if (kdbus_conn_is_monitor(c)) + continue; + + /* all names the connection owns */ + if (flags & (KDBUS_LIST_NAMES | + KDBUS_LIST_ACTIVATORS | + KDBUS_LIST_QUEUED)) { + struct kdbus_name_owner *o; + + list_for_each_entry(o, &c->names_list, conn_entry) { + if (o->flags & KDBUS_NAME_ACTIVATOR) { + if (!(flags & KDBUS_LIST_ACTIVATORS)) + continue; + + ret = kdbus_list_write(conn, c, slice, + &p, o, write); + if (ret < 0) { + mutex_unlock(&c->lock); + return ret; + } + + added = true; + } else if (o->flags & KDBUS_NAME_IN_QUEUE) { + if (!(flags & KDBUS_LIST_QUEUED)) + continue; + + ret = kdbus_list_write(conn, c, slice, + &p, o, write); + if (ret < 0) { + mutex_unlock(&c->lock); + return ret; + } + + added = true; + } else if (flags & KDBUS_LIST_NAMES) { + ret = kdbus_list_write(conn, c, slice, + &p, o, write); + if (ret < 0) { + mutex_unlock(&c->lock); + return ret; + } + + added = true; + } + } + } + + /* nothing added so far, just add the unique ID */ + if (!added && (flags & KDBUS_LIST_UNIQUE)) { + ret = kdbus_list_write(conn, c, slice, &p, NULL, write); + if (ret < 0) + return ret; + } + } + + *pos = p; + return 0; +} + +/** + * kdbus_cmd_list() - handle KDBUS_CMD_LIST + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. 
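+ *
+ * Internally, kdbus_list_all() runs twice over the same data: a first,
+ * sizing pass (write == false) merely accumulates the record size, then a
+ * pool slice of exactly that size is allocated and a second pass copies
+ * the records into it.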
+ */ +int kdbus_cmd_list(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_name_registry *reg = conn->ep->bus->name_registry; + struct kdbus_pool_slice *slice = NULL; + struct kdbus_cmd_list *cmd; + size_t pos, size; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_LIST_UNIQUE | + KDBUS_LIST_NAMES | + KDBUS_LIST_ACTIVATORS | + KDBUS_LIST_QUEUED, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + /* lock order: domain -> bus -> ep -> names -> conn */ + down_read(&reg->rwlock); + down_read(&conn->ep->bus->conn_rwlock); + down_read(&conn->ep->policy_db.entries_rwlock); + + /* size of records */ + size = 0; + ret = kdbus_list_all(conn, cmd->flags, NULL, &size, false); + if (ret < 0) + goto exit_unlock; + + if (size == 0) { + kdbus_pool_publish_empty(conn->pool, &cmd->offset, + &cmd->list_size); + } else { + slice = kdbus_pool_slice_alloc(conn->pool, size, false); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto exit_unlock; + } + + /* copy the records */ + pos = 0; + ret = kdbus_list_all(conn, cmd->flags, slice, &pos, true); + if (ret < 0) + goto exit_unlock; + + WARN_ON(pos != size); + kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->list_size); + } + + if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) || + kdbus_member_set_user(&cmd->list_size, argp, + typeof(*cmd), list_size)) + ret = -EFAULT; + +exit_unlock: + up_read(&conn->ep->policy_db.entries_rwlock); + up_read(&conn->ep->bus->conn_rwlock); + up_read(&reg->rwlock); + kdbus_pool_slice_release(slice); + return kdbus_args_clear(&args, ret); +} diff -Nur linux-4.2/ipc/kdbus/names.h linux-4.2-pck/ipc/kdbus/names.h --- linux-4.2/ipc/kdbus/names.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/names.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version.
+ */ + +#ifndef __KDBUS_NAMES_H +#define __KDBUS_NAMES_H + +#include +#include + +struct kdbus_name_entry; +struct kdbus_name_owner; +struct kdbus_name_registry; + +/** + * struct kdbus_name_registry - names registered for a bus + * @entries_hash: Map of entries + * @rwlock: Registry data lock + * @name_seq_last: Last used sequence number to assign to a name entry + */ +struct kdbus_name_registry { + DECLARE_HASHTABLE(entries_hash, 8); + struct rw_semaphore rwlock; + u64 name_seq_last; +}; + +/** + * struct kdbus_name_entry - well-known name entry + * @name_id: sequence number of name entry to be able to uniquely + * identify a name over its registration lifetime + * @activator: activator of this name, or NULL + * @queue: list of queued owners + * @hentry: entry in registry map + * @name: well-known name + */ +struct kdbus_name_entry { + u64 name_id; + struct kdbus_name_owner *activator; + struct list_head queue; + struct hlist_node hentry; + char name[]; +}; + +/** + * struct kdbus_name_owner - owner of a well-known name + * @flags: KDBUS_NAME_* flags of this owner + * @conn: connection owning the name + * @name: name that is owned + * @conn_entry: link into @conn + * @name_entry: link into @name + */ +struct kdbus_name_owner { + u64 flags; + struct kdbus_conn *conn; + struct kdbus_name_entry *name; + struct list_head conn_entry; + struct list_head name_entry; +}; + +bool kdbus_name_is_valid(const char *p, bool allow_wildcard); + +struct kdbus_name_registry *kdbus_name_registry_new(void); +void kdbus_name_registry_free(struct kdbus_name_registry *reg); + +struct kdbus_name_entry * +kdbus_name_lookup_unlocked(struct kdbus_name_registry *reg, const char *name); + +int kdbus_name_acquire(struct kdbus_name_registry *reg, + struct kdbus_conn *conn, const char *name, + u64 flags, u64 *return_flags); +void kdbus_name_release_all(struct kdbus_name_registry *reg, + struct kdbus_conn *conn); + +int kdbus_cmd_name_acquire(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_name_release(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_list(struct kdbus_conn *conn, void __user *argp); + +/** + * kdbus_name_get_owner() - get current owner of a name + * @name: name to get current owner of + * + * This returns a pointer to the current owner of a name (or its activator if + * there is no owner). The caller must make sure @name is valid and does not + * vanish. + * + * Return: Pointer to current owner or NULL if there is none. + */ +static inline struct kdbus_name_owner * +kdbus_name_get_owner(struct kdbus_name_entry *name) +{ + return list_first_entry_or_null(&name->queue, struct kdbus_name_owner, + name_entry) ? : name->activator; +} + +#endif diff -Nur linux-4.2/ipc/kdbus/node.c linux-4.2-pck/ipc/kdbus/node.c --- linux-4.2/ipc/kdbus/node.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/node.c 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,948 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "domain.h" +#include "endpoint.h" +#include "fs.h" +#include "handle.h" +#include "node.h" +#include "util.h" + +/** + * DOC: kdbus nodes + * + * Nodes unify lifetime management across exposed kdbus objects and provide a + * hierarchy. Each kdbus object that might be exposed to user-space has a + * kdbus_node object embedded and is linked into the hierarchy. Each node can + * have any number (0-n) of child nodes linked. Each child retains a reference + * to its parent node. For root-nodes, the parent is NULL. + * + * Each node object goes through a bunch of states during its lifetime: + * * NEW + * * LINKED (can be skipped by NEW->FREED transition) + * * ACTIVE (can be skipped by LINKED->INACTIVE transition) + * * INACTIVE + * * DRAINED + * * FREED + * + * Each node is allocated by the caller and initialized via kdbus_node_init(). + * This never fails and sets the object into state NEW. From now on, ref-counts + * on the node manage its lifetime. During init, the ref-count is set to 1. Once + * it drops to 0, the node goes to state FREED and the node->free_cb() callback + * is called to deallocate any memory. + * + * After initializing a node, you usually link it into the hierarchy. You need + * to provide a parent node and a name. The node will be linked as child to the + * parent and a globally unique ID is assigned to the child. The name of the + * child must be unique for all children of this parent. Otherwise, linking the + * child will fail with -EEXIST. + * Note that the child is not marked active, yet. Linking it does prevent any + * other node from being linked with the same name (thus, it reserves that + * name), but any child-lookup (via name or unique ID) will never return this + * child unless it has been marked active. + * + * Once successfully linked, you can use kdbus_node_activate() to activate a + * child. This will mark the child active. This state can be skipped by directly + * deactivating the child via kdbus_node_deactivate() (see below). + * By activating a child, you enable any lookups on this child to succeed from + * now on. Furthermore, any code that got its hands on a reference to the node, + * can from now on "acquire" the node. + * + * Active References (or: 'acquiring' and 'releasing' a node) + * In addition to normal object references, nodes support something we call + * "active references". An active reference can be acquired via + * kdbus_node_acquire() and released via kdbus_node_release(). A caller + * _must_ own a normal object reference whenever calling those functions. + * Unlike object references, acquiring an active reference can fail (by + * returning 'false' from kdbus_node_acquire()). An active reference can + * only be acquired if the node is marked active. If it is not marked + * active, yet, or if it was already deactivated, no more active references + * can be acquired, ever! + * Active references are used to track tasks working on a node. Whenever a + * task enters kernel-space to perform an action on a node, it acquires an + * active reference, performs the action and releases the reference again. + * While holding an active reference, the node is guaranteed to stay active. + * If the node is deactivated in parallel, the node is marked as + * deactivated, then we wait for all active references to be dropped, before + * we finally proceed with any cleanups.
That is, if you hold an active + * reference to a node, any resources that are bound to the "active" state + * are guaranteed to stay accessible until you release your reference. + * + * Active-references are very similar to rw-locks, where acquiring a node is + * equal to try-read-lock and releasing to read-unlock. Deactivating a node + * means write-lock and never releasing it again. + * Unlike rw-locks, the 'active reference' concept is more versatile and + * avoids unusual rw-lock usage (never releasing a write-lock). + * + * It is safe to acquire multiple active-references recursively. But you + * need to check the return value of kdbus_node_acquire() on _each_ call. It + * may stop granting references at _any_ time. + * + * You're free to perform any operations you want while holding an active + * reference, except sleeping for an indefinite period. Sleeping for a fixed + * amount of time is fine, but you usually should not wait on wait-queues + * without a timeout. + * For example, if you wait for I/O to happen, you should gather all data + * and schedule the I/O operation, then release your active reference and + * wait for it to complete. Then try to acquire a new reference. If it + * fails, perform any cleanup (the node is now dead). Otherwise, you can + * finish your operation. + * + * All nodes can be deactivated via kdbus_node_deactivate() at any time. You can + * call this multiple times, even in parallel or on nodes that were never + * linked, and it will just work. Furthermore, all children will be deactivated + * recursively as well. If a node is deactivated, there might still be active + * references that were acquired before calling kdbus_node_deactivate(). The + * owner of an object must call kdbus_node_drain() (which is a superset of + * kdbus_node_deactivate()) before dropping their reference. This will + * deactivate the node and also synchronously wait for all active references to + * be dropped. Hence, once kdbus_node_drain() returns, the node is fully + * released and no active references exist anymore. + * kdbus_node_drain() can be called at any time, multiple times, and in + * parallel on multiple threads. All calls are synchronized internally and will + * return only once the node is fully drained. The only restriction is that you + * must not hold an active reference when calling kdbus_node_drain() (unlike + * deactivation, which allows the caller to hold an active reference). + * + * When a node is activated, we acquire a normal object reference to the node. + * This reference is dropped after deactivation is fully done (and only if the + * node really was activated). This allows callers to link+activate a child node + * and then drop all refs. This has the effect that nobody owns a reference to + * the node, except for the parent node. Hence, if the parent is deactivated + * (and thus all children are deactivated, too), this will automatically + * release the child node. + * + * Currently, nodes provide a bunch of resources that external code can use + * directly. This includes: + * + * * node->waitq: Each node has its own wait-queue that is used to manage + * the 'active' state. When a node is deactivated, we wait on + * this queue until all active refs are dropped. Analogously, + * when you release an active reference on a deactivated + * node, and the active ref-count drops to 0, we wake up a + * single thread on this queue. Furthermore, once the + * ->release_cb() callback has finished, we wake up all waiters.
+ * The node-owner is free to re-use this wait-queue for other + * purposes. As node-management uses this queue only during + * deactivation, it is usually totally fine to re-use the + * queue for other, preferably low-overhead, use-cases. + * + * * node->type: This field defines the type of the owner of this node. It + * must be set during node initialization and must remain + * constant. The node management never looks at this value, + * but external users might use it to gain access to the owner + * object of a node. + * It is totally up to the owner of the node to define what + * their type means. Usually it means you can access the + * parent structure via container_of(), as long as you hold an + * active reference to the node. + * + * * node->free_cb: callback after all references are dropped + * node->release_cb: callback during node deactivation + * These fields must be set by the node owner during + * node initialization. They must remain constant. If + * NULL, they're skipped. + * + * * node->mode: filesystem access modes + * node->uid: filesystem owner uid + * node->gid: filesystem owner gid + * These fields must be set by the node owner during node + * initialization. They must remain constant and may be + * accessed by other callers to properly initialize + * filesystem nodes. + * + * * node->id: This is an unsigned 32-bit integer allocated by an IDA. It is + * always kept as small as possible during allocation and is + * globally unique across all nodes allocated by this module. 0 + * is reserved as "not assigned" and is the default. + * The ID is assigned during kdbus_node_link() and is kept until + * the object is freed. Thus, the ID outlives the active + * lifetime of a node. As long as you hold an object reference + * to a node (and the node was linked once), the ID is valid and + * unique. + * + * * node->name: name of this node + * node->hash: 31-bit hash-value of @name (range [2..INT_MAX-1]) + * These values follow the same lifetime rules as node->id. + * They're initialized when the node is linked and then remain + * constant until the last object reference is dropped. + * Unlike the id, the name is only unique across all siblings + * and only until the node is deactivated. Currently, the name + * is even unique if linked but not activated, yet. This might + * change in the future, though. Code should not rely on this. + * + * * node->lock: lock to protect node->children, node->rb, node->parent + * * node->parent: Reference to parent node. This is set during LINK time + * and is dropped during destruction. You can freely access + * this field, but it may be NULL (root node). + * * node->children: rb-tree of all linked children of this node. You must + * not access this directly, but use one of the iterator + * or lookup helpers. + */ + +/* + * Bias values track states of "active references". They're all negative. If a + * node is active, its active-ref-counter is >=0 and tracks all active + * references. Once a node is deactivated, we subtract NODE_BIAS. This means the + * counter is now negative but still counts the active references. Once it drops + * to exactly NODE_BIAS, we know all active references were dropped. Exactly one + * thread will change it to NODE_RELEASE now, perform cleanup and then put it + * into NODE_DRAINED. Once drained, all other threads that tried deactivating + * the node will now be woken up (thus, they wait until the node is fully done). + * The initial state during node-setup is NODE_NEW.
If a node is directly + * deactivated without having ever been active, it is put into + * NODE_RELEASE_DIRECT instead of NODE_BIAS. This tracks this one-bit state + * across node-deactivation. The task putting it into NODE_RELEASE now knows + * whether the node was active before or not. + * + * Some archs implement atomic_sub(v) with atomic_add(-v), so reserve INT_MIN + * to avoid overflows if multiplied by -1. + */ +#define KDBUS_NODE_BIAS (INT_MIN + 5) +#define KDBUS_NODE_RELEASE_DIRECT (KDBUS_NODE_BIAS - 1) +#define KDBUS_NODE_RELEASE (KDBUS_NODE_BIAS - 2) +#define KDBUS_NODE_DRAINED (KDBUS_NODE_BIAS - 3) +#define KDBUS_NODE_NEW (KDBUS_NODE_BIAS - 4) + +/* global unique ID mapping for kdbus nodes */ +DEFINE_IDA(kdbus_node_ida); + +/** + * kdbus_node_name_hash() - hash a name + * @name: The string to hash + * + * This computes the hash of @name. It is guaranteed to be in the range + * [2..INT_MAX-1]. The values 0, 1 and INT_MAX are unused as they are reserved + * for the filesystem code. + * + * Return: hash value of the passed string + */ +static unsigned int kdbus_node_name_hash(const char *name) +{ + unsigned int hash; + + /* reserve hash numbers 0, 1 and >=INT_MAX for magic directories */ + hash = kdbus_strhash(name) & INT_MAX; + if (hash < 2) + hash += 2; + if (hash >= INT_MAX) + hash = INT_MAX - 1; + + return hash; +} + +/** + * kdbus_node_name_compare() - compare a name with a node's name + * @hash: hash of the string to compare the node with + * @name: name to compare the node with + * @node: node to compare the name with + * + * This compares a query string against a kdbus node. If the kdbus node has the + * given name, this returns 0. Otherwise, this returns >0 / <0 depending on + * whether the query string is greater / less than the node. + * + * Note: If @node is drained but has the name @name, this returns 1. The + * reason for this is that we treat drained nodes as "renamed". The + * slot of such nodes is no longer occupied and new nodes can claim it. + * Obviously, this has the side-effect that you cannot match drained + * nodes, as they will never return 0 on name-matches. But this is + * intentional, as there is no reason why anyone would ever want to match + * on drained nodes. + * + * Return: 0 if @name and @hash exactly match the information in @node, or + * an integer less than or greater than zero if @name is found, respectively, + * to be less than or greater than the string stored in @node. + */ +static int kdbus_node_name_compare(unsigned int hash, const char *name, + const struct kdbus_node *node) +{ + int ret; + + if (hash != node->hash) + return hash - node->hash; + + ret = strcmp(name, node->name); + if (ret != 0) + return ret; + + return atomic_read(&node->active) == KDBUS_NODE_DRAINED; +} + +/** + * kdbus_node_init() - initialize a kdbus_node + * @node: Pointer to the node to initialize + * @type: The type the node will have (KDBUS_NODE_*) + * + * The caller is responsible for allocating @node and initializing it to zero. + * Once this call returns, you must use the kdbus_node_ref() and + * kdbus_node_unref() functions to manage this node.
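+ *
+ * Example (editorial sketch, not part of the original patch): a typical
+ * owner embeds the node in its own object and walks it through the
+ * life-cycle described in the DOC section above; "foo", f and parent are
+ * made-up names for illustration:
+ *
+ *	kdbus_node_init(&f->node, KDBUS_NODE_BUS);
+ *	ret = kdbus_node_link(&f->node, &parent->node, "foo");
+ *	if (ret < 0)
+ *		goto exit;			/* drain + unref below */
+ *	kdbus_node_activate(&f->node);
+ *	/* ... node is live and can be looked up / acquired ... */
+ *	kdbus_node_drain(&f->node);		/* deactivate, wait for users */
+ *	kdbus_node_unref(&f->node);		/* drop the initial reference */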
+ */ +void kdbus_node_init(struct kdbus_node *node, unsigned int type) +{ + atomic_set(&node->refcnt, 1); + mutex_init(&node->lock); + node->id = 0; + node->type = type; + RB_CLEAR_NODE(&node->rb); + node->children = RB_ROOT; + init_waitqueue_head(&node->waitq); + atomic_set(&node->active, KDBUS_NODE_NEW); +} + +/** + * kdbus_node_link() - link a node into the nodes system + * @node: Pointer to the node to initialize + * @parent: Pointer to a parent node, may be %NULL + * @name: The name of the node (or NULL if root node) + * + * This links a node into the hierarchy. This must not be called multiple times. + * If @parent is NULL, the node becomes a new root node. + * + * This call will fail if @name is not unique across all its siblings or if no + * ID could be allocated. You must not activate a node if linking failed! It is + * safe to deactivate it, though. + * + * Once you linked a node, you must call kdbus_node_drain() before you drop + * the last reference (even if you never activate the node). + * + * Return: 0 on success. negative error otherwise. + */ +int kdbus_node_link(struct kdbus_node *node, struct kdbus_node *parent, + const char *name) +{ + int ret; + + if (WARN_ON(node->type != KDBUS_NODE_DOMAIN && !parent)) + return -EINVAL; + + if (WARN_ON(parent && !name)) + return -EINVAL; + + if (name) { + node->name = kstrdup(name, GFP_KERNEL); + if (!node->name) + return -ENOMEM; + + node->hash = kdbus_node_name_hash(name); + } + + ret = ida_simple_get(&kdbus_node_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + return ret; + + node->id = ret; + ret = 0; + + if (parent) { + struct rb_node **n, *prev; + + if (!kdbus_node_acquire(parent)) + return -ESHUTDOWN; + + mutex_lock(&parent->lock); + + n = &parent->children.rb_node; + prev = NULL; + + while (*n) { + struct kdbus_node *pos; + int result; + + pos = kdbus_node_from_rb(*n); + prev = *n; + result = kdbus_node_name_compare(node->hash, + node->name, + pos); + if (result == 0) { + ret = -EEXIST; + goto exit_unlock; + } + + if (result < 0) + n = &pos->rb.rb_left; + else + n = &pos->rb.rb_right; + } + + /* add new node and rebalance the tree */ + rb_link_node(&node->rb, prev, n); + rb_insert_color(&node->rb, &parent->children); + node->parent = kdbus_node_ref(parent); + +exit_unlock: + mutex_unlock(&parent->lock); + kdbus_node_release(parent); + } + + return ret; +} + +/** + * kdbus_node_ref() - Acquire object reference + * @node: node to acquire reference to (or NULL) + * + * This acquires a new reference to @node. You must already own a reference when + * calling this! + * If @node is NULL, this is a no-op. + * + * Return: @node is returned + */ +struct kdbus_node *kdbus_node_ref(struct kdbus_node *node) +{ + if (node) + atomic_inc(&node->refcnt); + return node; +} + +/** + * kdbus_node_unref() - Drop object reference + * @node: node to drop reference to (or NULL) + * + * This drops an object reference to @node. You must not access the node if you + * no longer own a reference. + * If the ref-count drops to 0, the object will be destroyed (->free_cb will be + * called). + * + * If you linked or activated the node, you must deactivate the node before you + * drop your last reference! If you didn't link or activate the node, you can + * drop any reference you want. + * + * Note that this calls into ->free_cb() and thus _might_ sleep. The ->free_cb() + * callbacks must not acquire any outer locks, though. So you can safely drop + * references while holding locks (apart from node->parent->lock). + * + * If @node is NULL, this is a no-op. 
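+ *
+ * Editorial note: since this function always returns NULL (see the Return
+ * note below), a common idiom is to clear the caller's pointer in the same
+ * statement:
+ *
+ *	n = kdbus_node_unref(n);	/* n is NULL afterwards */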
+ * + * Return: This always returns NULL + */ +struct kdbus_node *kdbus_node_unref(struct kdbus_node *node) +{ + if (node && atomic_dec_and_test(&node->refcnt)) { + struct kdbus_node safe = *node; + + WARN_ON(atomic_read(&node->active) != KDBUS_NODE_DRAINED); + + if (node->parent) { + mutex_lock(&node->parent->lock); + if (!RB_EMPTY_NODE(&node->rb)) { + rb_erase(&node->rb, + &node->parent->children); + RB_CLEAR_NODE(&node->rb); + } + mutex_unlock(&node->parent->lock); + } + + if (node->free_cb) + node->free_cb(node); + if (safe.id > 0) + ida_simple_remove(&kdbus_node_ida, safe.id); + + kfree(safe.name); + kdbus_node_unref(safe.parent); + } + + return NULL; +} + +/** + * kdbus_node_is_active() - test whether a node is active + * @node: node to test + * + * This checks whether @node is active. That means, @node was linked and + * activated by the node owner and hasn't been deactivated, yet. If, and only + * if, a node is active, kdbus_node_acquire() will be able to acquire active + * references. + * + * Note that this function does not give any lifetime guarantees. After this + * call returns, the node might be deactivated immediately. Normally, what you + * want is to acquire a real active reference via kdbus_node_acquire(). + * + * Return: true if @node is active, false otherwise + */ +bool kdbus_node_is_active(struct kdbus_node *node) +{ + return atomic_read(&node->active) >= 0; +} + +/** + * kdbus_node_is_deactivated() - test whether a node was already deactivated + * @node: node to test + * + * This checks whether kdbus_node_deactivate() was called on @node. Note that + * this might be true even if you never deactivated the node directly, but only + * one of its ancestors. + * + * Note that even if this returns 'false', the node might get deactivated + * immediately after the call returns. + * + * Return: true if @node was already deactivated, false if not + */ +bool kdbus_node_is_deactivated(struct kdbus_node *node) +{ + int v; + + v = atomic_read(&node->active); + return v != KDBUS_NODE_NEW && v < 0; +} + +/** + * kdbus_node_activate() - activate a node + * @node: node to activate + * + * This marks @node as active if, and only if, the node hasn't been activated + * or deactivated yet, and the parent is still active. Any but the first call to + * kdbus_node_activate() is a no-op. + * If you called kdbus_node_deactivate() before, then even the first call to + * kdbus_node_activate() will be a no-op. + * + * This call doesn't give any lifetime guarantees. The node might get + * deactivated immediately after this call returns. Or the parent might already + * be deactivated, which will make this call a no-op. + * + * If this call successfully activated a node, it will take an object reference + * to it. This reference is dropped after the node is deactivated. Therefore, + * the object owner can safely drop their reference to @node iff they know that + * its parent node will get deactivated at some point. Once the parent node is + * deactivated, it will deactivate all its children and thus drop this reference + * again. + * + * Return: True if this call successfully activated the node, otherwise false. + * Note that this might return false, even if the node is still active + * (e.g., if you called this a second time).
+ */ +bool kdbus_node_activate(struct kdbus_node *node) +{ + bool res = false; + + mutex_lock(&node->lock); + if (atomic_read(&node->active) == KDBUS_NODE_NEW) { + atomic_sub(KDBUS_NODE_NEW, &node->active); + /* activated nodes have ref +1 */ + kdbus_node_ref(node); + res = true; + } + mutex_unlock(&node->lock); + + return res; +} + +/** + * kdbus_node_recurse_unlock() - advance iterator on a tree + * @start: node at which the iteration started + * @node: previously visited node + * + * This helper advances an iterator by one, when traversing a node tree. It is + * supposed to be used like this: + * + * struct kdbus_node *n; + * + * n = start; + * while (n) { + * mutex_lock(&n->lock); + * ... visit @n ... + * n = kdbus_node_recurse_unlock(start, n); + * } + * + * This helper takes as input the start-node of the iteration and the current + * position. It returns a pointer to the next node to visit. The caller must + * hold a reference to @start during the whole iteration. Furthermore, @node + * must be locked when entering this helper. On return, the lock is released. + * + * The order of visit is pre-order traversal. + * + * If @node is deactivated before recursing its children, then it is guaranteed + * that all children will be visited. If @node is still active, new nodes might + * be inserted during traversal, and thus might be missed. + * + * Also note that the node-locks are released while traversing children. You + * must not rely on the locks to be held during the whole traversal. Each node + * that is visited is pinned by this helper, so the caller can rely on owning a + * reference. It is dropped, once all of the children of the node have been + * visited (recursively). + * + * You *must not* bail out of a traversal early, otherwise you'll leak + * ref-counts to all nodes in the current depth-path. + * + * Return: Reference to next node, or NULL. + */ +static struct kdbus_node *kdbus_node_recurse_unlock(struct kdbus_node *start, + struct kdbus_node *node) +{ + struct kdbus_node *t, *prev = NULL; + struct rb_node *rb; + + lockdep_assert_held(&node->lock); + + rb = rb_first(&node->children); + if (!rb) { + do { + mutex_unlock(&node->lock); + kdbus_node_unref(prev); + + if (node == start) + return NULL; + + prev = node; + node = node->parent; + + mutex_lock(&node->lock); + rb = rb_next(&prev->rb); + } while (!rb); + } + + t = kdbus_node_ref(kdbus_node_from_rb(rb)); + mutex_unlock(&node->lock); + kdbus_node_unref(prev); + return t; +} + +/** + * kdbus_node_deactivate() - deactivate a node + * @node: node to deactivate + * + * This recursively deactivates the passed node and all its children. The nodes + * are marked as deactivated, but they're not drained. Hence, even after this + * call returns, there might still be someone holding an active reference to + * any of the nodes. However, no new active references can be acquired after + * this returns. + * + * It is safe to call this multiple times (even in parallel). Each call is + * guaranteed to only return after _all_ nodes have been deactivated. + */ +void kdbus_node_deactivate(struct kdbus_node *node) +{ + struct kdbus_node *pos; + int v; + + pos = node; + while (pos) { + mutex_lock(&pos->lock); + + /* + * Add BIAS to pos->active to mark it as inactive. If it was + * never active before, immediately mark it as + * KDBUS_NODE_RELEASE_DIRECT so that this case can be detected + * later on. + * If the node was already deactivated, make sure to still + * recurse into the children.
Otherwise, we might return before + * a racing thread finished deactivating all children. But we + * want to guarantee that the whole tree is deactivated once + * this returns. + */ + v = atomic_read(&pos->active); + if (v >= 0) + atomic_add_return(KDBUS_NODE_BIAS, &pos->active); + else if (v == KDBUS_NODE_NEW) + atomic_set(&pos->active, KDBUS_NODE_RELEASE_DIRECT); + + pos = kdbus_node_recurse_unlock(node, pos); + } +} + +/** + * kdbus_node_drain() - drain a node + * @node: node to drain + * + * This function recursively deactivates this node and all its children and + * then waits for all active references to be dropped. This function is a + * superset of kdbus_node_deactivate(), as it additionally drains all nodes. It + * returns only once all children and the node itself were recursively drained + * (even if you call this function multiple times in parallel). + * + * It is safe to call this function on _any_ node that was initialized, _any_ + * number of times. + * + * This call may sleep, as it waits for all active references to be dropped. + */ +void kdbus_node_drain(struct kdbus_node *node) +{ + struct kdbus_node *pos; + int v; + + kdbus_node_deactivate(node); + + pos = node; + while (pos) { + /* wait until all active references were dropped */ + wait_event(pos->waitq, + atomic_read(&pos->active) <= KDBUS_NODE_BIAS); + + /* mark object as RELEASE */ + mutex_lock(&pos->lock); + v = atomic_read(&pos->active); + if (v == KDBUS_NODE_BIAS || v == KDBUS_NODE_RELEASE_DIRECT) + atomic_set(&pos->active, KDBUS_NODE_RELEASE); + mutex_unlock(&pos->lock); + + /* + * If this is the thread that marked the object as RELEASE, we + * perform the actual release. Otherwise, we wait until the + * release is done and the node is marked as DRAINED. + */ + if (v == KDBUS_NODE_BIAS || v == KDBUS_NODE_RELEASE_DIRECT) { + if (pos->release_cb) + pos->release_cb(pos, v == KDBUS_NODE_BIAS); + + /* mark as DRAINED */ + atomic_set(&pos->active, KDBUS_NODE_DRAINED); + wake_up_all(&pos->waitq); + + /* drop VFS cache */ + kdbus_fs_flush(pos); + + /* + * If the node was activated and someone subtracted BIAS + * from it to deactivate it, we, and only we, are + * responsible for releasing the extra ref-count that was + * taken once in kdbus_node_activate(). + * If the node was never activated, no one ever + * subtracted BIAS, but instead skipped that state and + * immediately went to NODE_RELEASE_DIRECT. In that case + * we must not drop the reference. + */ + if (v == KDBUS_NODE_BIAS) + kdbus_node_unref(pos); + } else { + /* wait until object is DRAINED */ + wait_event(pos->waitq, + atomic_read(&pos->active) == KDBUS_NODE_DRAINED); + } + + mutex_lock(&pos->lock); + pos = kdbus_node_recurse_unlock(node, pos); + } +} + +/** + * kdbus_node_acquire() - Acquire an active ref on a node + * @node: The node + * + * This acquires an active-reference to @node. This will only succeed if the + * node is active. You must release this active reference via + * kdbus_node_release() again. + * + * See the introduction to "active references" for more details. + * + * Return: %true if @node was non-NULL and active + */ +bool kdbus_node_acquire(struct kdbus_node *node) +{ + return node && atomic_inc_unless_negative(&node->active); +} + +/** + * kdbus_node_release() - Release an active ref on a node + * @node: The node + * + * This releases an active reference that was previously acquired via + * kdbus_node_acquire(). See kdbus_node_acquire() for details.
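+ *
+ * Example (editorial illustration, not part of the original patch): the
+ * usual guard around an operation on a node, the same pattern
+ * kdbus_node_link() uses on the parent node above:
+ *
+ *	if (!kdbus_node_acquire(node))
+ *		return -ESHUTDOWN;	/* node is already deactivated */
+ *	/* ... operate on the node; it stays active meanwhile ... */
+ *	kdbus_node_release(node);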
+ */ +void kdbus_node_release(struct kdbus_node *node) +{ + if (node && atomic_dec_return(&node->active) == KDBUS_NODE_BIAS) + wake_up(&node->waitq); +} + +/** + * kdbus_node_find_child() - Find child by name + * @node: parent node to search through + * @name: name of child node + * + * This searches through all children of @node for a child-node with name @name. + * If not found, or if the child is deactivated, NULL is returned. Otherwise, + * the child is acquired and a new reference is returned. + * + * If you're done with the child, you need to release it and drop your + * reference. + * + * This function does not acquire the parent node. However, if the parent was + * already deactivated, then kdbus_node_deactivate() will, at some point, also + * deactivate the child. Therefore, we can rely on the explicit ordering during + * deactivation. + * + * Return: Reference to acquired child node, or NULL if not found / not active. + */ +struct kdbus_node *kdbus_node_find_child(struct kdbus_node *node, + const char *name) +{ + struct kdbus_node *child; + struct rb_node *rb; + unsigned int hash; + int ret; + + hash = kdbus_node_name_hash(name); + + mutex_lock(&node->lock); + rb = node->children.rb_node; + while (rb) { + child = kdbus_node_from_rb(rb); + ret = kdbus_node_name_compare(hash, name, child); + if (ret < 0) + rb = rb->rb_left; + else if (ret > 0) + rb = rb->rb_right; + else + break; + } + if (rb && kdbus_node_acquire(child)) + kdbus_node_ref(child); + else + child = NULL; + mutex_unlock(&node->lock); + + return child; +} + +static struct kdbus_node *node_find_closest_unlocked(struct kdbus_node *node, + unsigned int hash, + const char *name) +{ + struct kdbus_node *n, *pos = NULL; + struct rb_node *rb; + int res; + + /* + * Find the closest child with ``node->hash >= hash'', or, if @name is + * valid, ``node->name >= name'' (where '>=' is the lex. order). + */ + + rb = node->children.rb_node; + while (rb) { + n = kdbus_node_from_rb(rb); + + if (name) + res = kdbus_node_name_compare(hash, name, n); + else + res = hash - n->hash; + + if (res <= 0) { + rb = rb->rb_left; + pos = n; + } else { /* ``hash > n->hash'', ``name > n->name'' */ + rb = rb->rb_right; + } + } + + return pos; +} + +/** + * kdbus_node_find_closest() - Find closest child-match + * @node: parent node to search through + * @hash: hash value to find closest match for + * + * Find the closest child of @node with a hash greater than or equal to @hash. + * The closest match is the left-most child of @node with this property. That + * is, it is the first child with that hash that kdbus_node_next_child() would + * return if you iterated over the whole parent node. + * + * Return: Reference to acquired child, or NULL if none found. + */ +struct kdbus_node *kdbus_node_find_closest(struct kdbus_node *node, + unsigned int hash) +{ + struct kdbus_node *child; + struct rb_node *rb; + + mutex_lock(&node->lock); + + child = node_find_closest_unlocked(node, hash, NULL); + while (child && !kdbus_node_acquire(child)) { + rb = rb_next(&child->rb); + if (rb) + child = kdbus_node_from_rb(rb); + else + child = NULL; + } + kdbus_node_ref(child); + + mutex_unlock(&node->lock); + + return child; +} + +/** + * kdbus_node_next_child() - Acquire next child + * @node: parent node + * @prev: previous child-node position or NULL + * + * This function returns a reference to the next active child of @node, after + * the passed position @prev. If @prev is NULL, a reference to the first active + * child is returned.
If no more active children are found, NULL is returned. + * + * This function acquires the next child it returns. If you're done with the + * returned pointer, you need to release _and_ unref it. + * + * The passed in pointer @prev is not modified by this function, and it does + * *not* have to be active. If @prev was acquired via different means, or if it + * was unlinked from its parent before you pass it in, then this iterator will + * still return the next active child (it will have to search through the + * rb-tree based on the node-name, though). + * However, @prev must not be linked to a different parent than @node! + * + * Return: Reference to next acquired child, or NULL if at the end. + */ +struct kdbus_node *kdbus_node_next_child(struct kdbus_node *node, + struct kdbus_node *prev) +{ + struct kdbus_node *pos = NULL; + struct rb_node *rb; + + mutex_lock(&node->lock); + + if (!prev) { + /* + * New iteration; find first node in rb-tree and try to acquire + * it. If we got it, directly return it as first element. + * Otherwise, the loop below will find the next active node. + */ + rb = rb_first(&node->children); + if (!rb) + goto exit; + pos = kdbus_node_from_rb(rb); + if (kdbus_node_acquire(pos)) + goto exit; + } else { + /* + * The current iterator is still linked to the parent. Set it + * as current position and use the loop below to find the next + * active element. + */ + pos = prev; + } + + /* @pos was already returned or is inactive; find next active node */ + do { + rb = rb_next(&pos->rb); + if (rb) + pos = kdbus_node_from_rb(rb); + else + pos = NULL; + } while (pos && !kdbus_node_acquire(pos)); + +exit: + /* @pos is NULL or acquired. Take ref if non-NULL and return it */ + kdbus_node_ref(pos); + mutex_unlock(&node->lock); + return pos; +} diff -Nur linux-4.2/ipc/kdbus/node.h linux-4.2-pck/ipc/kdbus/node.h --- linux-4.2/ipc/kdbus/node.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/node.h 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef __KDBUS_NODE_H +#define __KDBUS_NODE_H + +#include +#include +#include +#include + +struct kdbus_node; + +enum kdbus_node_type { + KDBUS_NODE_DOMAIN, + KDBUS_NODE_CONTROL, + KDBUS_NODE_BUS, + KDBUS_NODE_ENDPOINT, +}; + +typedef void (*kdbus_node_free_t) (struct kdbus_node *node); +typedef void (*kdbus_node_release_t) (struct kdbus_node *node, bool was_active); + +struct kdbus_node { + struct mutex lock; + atomic_t refcnt; + atomic_t active; + wait_queue_head_t waitq; + + /* static members */ + unsigned int type; + kdbus_node_free_t free_cb; + kdbus_node_release_t release_cb; + umode_t mode; + kuid_t uid; + kgid_t gid; + + /* valid once linked */ + char *name; + unsigned int hash; + unsigned int id; + struct kdbus_node *parent; /* may be NULL */ + struct rb_node rb; + + /* dynamic list of children */ + struct rb_root children; +}; + +#define kdbus_node_from_rb(_node) rb_entry((_node), struct kdbus_node, rb) + +extern struct ida kdbus_node_ida; + +void kdbus_node_init(struct kdbus_node *node, unsigned int type); + +int kdbus_node_link(struct kdbus_node *node, struct kdbus_node *parent, + const char *name); + +struct kdbus_node *kdbus_node_ref(struct kdbus_node *node); +struct kdbus_node *kdbus_node_unref(struct kdbus_node *node); + +bool kdbus_node_is_active(struct kdbus_node *node); +bool kdbus_node_is_deactivated(struct kdbus_node *node); +bool kdbus_node_activate(struct kdbus_node *node); +void kdbus_node_deactivate(struct kdbus_node *node); +void kdbus_node_drain(struct kdbus_node *node); + +bool kdbus_node_acquire(struct kdbus_node *node); +void kdbus_node_release(struct kdbus_node *node); + +struct kdbus_node *kdbus_node_find_child(struct kdbus_node *node, + const char *name); +struct kdbus_node *kdbus_node_find_closest(struct kdbus_node *node, + unsigned int hash); +struct kdbus_node *kdbus_node_next_child(struct kdbus_node *node, + struct kdbus_node *prev); + +#endif diff -Nur linux-4.2/ipc/kdbus/notify.c linux-4.2-pck/ipc/kdbus/notify.c --- linux-4.2/ipc/kdbus/notify.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/notify.c 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,204 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "domain.h" +#include "endpoint.h" +#include "item.h" +#include "message.h" +#include "notify.h" + +static inline void kdbus_notify_add_tail(struct kdbus_staging *staging, + struct kdbus_bus *bus) +{ + spin_lock(&bus->notify_lock); + list_add_tail(&staging->notify_entry, &bus->notify_list); + spin_unlock(&bus->notify_lock); +} + +static int kdbus_notify_reply(struct kdbus_bus *bus, u64 id, + u64 cookie, u64 msg_type) +{ + struct kdbus_staging *s; + + s = kdbus_staging_new_kernel(bus, id, cookie, 0, msg_type); + if (IS_ERR(s)) + return PTR_ERR(s); + + kdbus_notify_add_tail(s, bus); + return 0; +} + +/** + * kdbus_notify_reply_timeout() - queue a timeout reply + * @bus: Bus which queues the messages + * @id: The destination's connection ID + * @cookie: The cookie to set in the reply. + * + * Queues a message that has a KDBUS_ITEM_REPLY_TIMEOUT item attached. + * + * Return: 0 on success, negative errno on failure. + */ +int kdbus_notify_reply_timeout(struct kdbus_bus *bus, u64 id, u64 cookie) +{ + return kdbus_notify_reply(bus, id, cookie, KDBUS_ITEM_REPLY_TIMEOUT); +} + +/** + * kdbus_notify_reply_dead() - queue a 'dead' reply + * @bus: Bus which queues the messages + * @id: The destination's connection ID + * @cookie: The cookie to set in the reply. + * + * Queues a message that has a KDBUS_ITEM_REPLY_DEAD item attached. + * + * Return: 0 on success, negative errno on failure. + */ +int kdbus_notify_reply_dead(struct kdbus_bus *bus, u64 id, u64 cookie) +{ + return kdbus_notify_reply(bus, id, cookie, KDBUS_ITEM_REPLY_DEAD); +} + +/** + * kdbus_notify_name_change() - queue a notification about a name owner change + * @bus: Bus which queues the messages + * @type: The type of the notification; KDBUS_ITEM_NAME_ADD, + * KDBUS_ITEM_NAME_CHANGE or KDBUS_ITEM_NAME_REMOVE + * @old_id: The id of the connection that used to own the name + * @new_id: The id of the new owner connection + * @old_flags: The flags to pass in the KDBUS_ITEM flags field for + * the old owner + * @new_flags: The flags to pass in the KDBUS_ITEM flags field for + * the new owner + * @name: The name that was removed or assigned to a new owner + * + * Return: 0 on success, negative errno on failure. + */ +int kdbus_notify_name_change(struct kdbus_bus *bus, u64 type, + u64 old_id, u64 new_id, + u64 old_flags, u64 new_flags, + const char *name) +{ + size_t name_len, extra_size; + struct kdbus_staging *s; + + name_len = strlen(name) + 1; + extra_size = sizeof(struct kdbus_notify_name_change) + name_len; + + s = kdbus_staging_new_kernel(bus, KDBUS_DST_ID_BROADCAST, 0, + extra_size, type); + if (IS_ERR(s)) + return PTR_ERR(s); + + s->notify->name_change.old_id.id = old_id; + s->notify->name_change.old_id.flags = old_flags; + s->notify->name_change.new_id.id = new_id; + s->notify->name_change.new_id.flags = new_flags; + memcpy(s->notify->name_change.name, name, name_len); + + kdbus_notify_add_tail(s, bus); + return 0; +} + +/** + * kdbus_notify_id_change() - queue a notification about a unique ID change + * @bus: Bus which queues the messages + * @type: The type of the notification; KDBUS_ITEM_ID_ADD or + * KDBUS_ITEM_ID_REMOVE + * @id: The id of the connection that was added or removed + * @flags: The flags to pass in the KDBUS_ITEM flags field + * + * Return: 0 on success, negative errno on failure.
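+ *
+ * Example (editorial sketch, not part of the original patch): a caller
+ * announcing a new connection would queue the notification and flush the
+ * bus once it has dropped its locks:
+ *
+ *	kdbus_notify_id_change(bus, KDBUS_ITEM_ID_ADD, conn->id, conn->flags);
+ *	kdbus_notify_flush(bus);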
+ */ +int kdbus_notify_id_change(struct kdbus_bus *bus, u64 type, u64 id, u64 flags) +{ + struct kdbus_staging *s; + size_t extra_size; + + extra_size = sizeof(struct kdbus_notify_id_change); + s = kdbus_staging_new_kernel(bus, KDBUS_DST_ID_BROADCAST, 0, + extra_size, type); + if (IS_ERR(s)) + return PTR_ERR(s); + + s->notify->id_change.id = id; + s->notify->id_change.flags = flags; + + kdbus_notify_add_tail(s, bus); + return 0; +} + +/** + * kdbus_notify_flush() - send a list of collected messages + * @bus: Bus which queues the messages + * + * The list is empty after sending the messages. + */ +void kdbus_notify_flush(struct kdbus_bus *bus) +{ + LIST_HEAD(notify_list); + struct kdbus_staging *s, *tmp; + + mutex_lock(&bus->notify_flush_lock); + down_read(&bus->name_registry->rwlock); + + spin_lock(&bus->notify_lock); + list_splice_init(&bus->notify_list, &notify_list); + spin_unlock(&bus->notify_lock); + + list_for_each_entry_safe(s, tmp, &notify_list, notify_entry) { + if (s->msg->dst_id != KDBUS_DST_ID_BROADCAST) { + struct kdbus_conn *conn; + + conn = kdbus_bus_find_conn_by_id(bus, s->msg->dst_id); + if (conn) { + kdbus_bus_eavesdrop(bus, NULL, s); + kdbus_conn_entry_insert(NULL, conn, s, NULL, + NULL); + kdbus_conn_unref(conn); + } + } else { + kdbus_bus_broadcast(bus, NULL, s); + } + + list_del(&s->notify_entry); + kdbus_staging_free(s); + } + + up_read(&bus->name_registry->rwlock); + mutex_unlock(&bus->notify_flush_lock); +} + +/** + * kdbus_notify_free() - free a list of collected messages + * @bus: Bus which queues the messages + */ +void kdbus_notify_free(struct kdbus_bus *bus) +{ + struct kdbus_staging *s, *tmp; + + list_for_each_entry_safe(s, tmp, &bus->notify_list, notify_entry) { + list_del(&s->notify_entry); + kdbus_staging_free(s); + } +} diff -Nur linux-4.2/ipc/kdbus/notify.h linux-4.2-pck/ipc/kdbus/notify.h --- linux-4.2/ipc/kdbus/notify.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/notify.h 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version.
+ */ + +#ifndef __KDBUS_NOTIFY_H +#define __KDBUS_NOTIFY_H + +struct kdbus_bus; + +int kdbus_notify_id_change(struct kdbus_bus *bus, u64 type, u64 id, u64 flags); +int kdbus_notify_reply_timeout(struct kdbus_bus *bus, u64 id, u64 cookie); +int kdbus_notify_reply_dead(struct kdbus_bus *bus, u64 id, u64 cookie); +int kdbus_notify_name_change(struct kdbus_bus *bus, u64 type, + u64 old_id, u64 new_id, + u64 old_flags, u64 new_flags, + const char *name); +void kdbus_notify_flush(struct kdbus_bus *bus); +void kdbus_notify_free(struct kdbus_bus *bus); + +#endif diff -Nur linux-4.2/ipc/kdbus/policy.c linux-4.2-pck/ipc/kdbus/policy.c --- linux-4.2/ipc/kdbus/policy.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/policy.c 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,489 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "domain.h" +#include "item.h" +#include "names.h" +#include "policy.h" + +#define KDBUS_POLICY_HASH_SIZE 64 + +/** + * struct kdbus_policy_db_entry_access - a database entry access item + * @type: One of KDBUS_POLICY_ACCESS_* types + * @access: Access to grant. One of KDBUS_POLICY_* + * @uid: For KDBUS_POLICY_ACCESS_USER, the global uid + * @gid: For KDBUS_POLICY_ACCESS_GROUP, the global gid + * @list: List entry item for the entry's list + * + * This is the internal version of struct kdbus_policy_db_access. + */ +struct kdbus_policy_db_entry_access { + u8 type; /* USER, GROUP, WORLD */ + u8 access; /* OWN, TALK, SEE */ + union { + kuid_t uid; /* global uid */ + kgid_t gid; /* global gid */ + }; + struct list_head list; +}; + +/** + * struct kdbus_policy_db_entry - a policy database entry + * @name: The name to match the policy entry against + * @hentry: The hash entry for the database's entries_hash + * @access_list: List head for keeping track of the entry's + * access items. + * @owner: The owner of this entry. Can be a kdbus_conn or + * a kdbus_ep object.
+ * @wildcard: The name is a wildcard, such as one ending in '.*' + */ +struct kdbus_policy_db_entry { + char *name; + struct hlist_node hentry; + struct list_head access_list; + const void *owner; + bool wildcard:1; +}; + +static void kdbus_policy_entry_free(struct kdbus_policy_db_entry *e) +{ + struct kdbus_policy_db_entry_access *a, *tmp; + + list_for_each_entry_safe(a, tmp, &e->access_list, list) { + list_del(&a->list); + kfree(a); + } + + kfree(e->name); + kfree(e); +} + +static unsigned int kdbus_strnhash(const char *str, size_t len) +{ + unsigned long hash = init_name_hash(); + + while (len--) + hash = partial_name_hash(*str++, hash); + + return end_name_hash(hash); +} + +static const struct kdbus_policy_db_entry * +kdbus_policy_lookup(struct kdbus_policy_db *db, const char *name, u32 hash) +{ + struct kdbus_policy_db_entry *e; + const char *dot; + size_t len; + + /* find exact match */ + hash_for_each_possible(db->entries_hash, e, hentry, hash) + if (strcmp(e->name, name) == 0 && !e->wildcard) + return e; + + /* find wildcard match */ + + dot = strrchr(name, '.'); + if (!dot) + return NULL; + + len = dot - name; + hash = kdbus_strnhash(name, len); + + hash_for_each_possible(db->entries_hash, e, hentry, hash) + if (e->wildcard && !strncmp(e->name, name, len) && + !e->name[len]) + return e; + + return NULL; +} + +/** + * kdbus_policy_db_clear() - release all memory from a policy db + * @db: The policy database + */ +void kdbus_policy_db_clear(struct kdbus_policy_db *db) +{ + struct kdbus_policy_db_entry *e; + struct hlist_node *tmp; + unsigned int i; + + /* purge entries */ + down_write(&db->entries_rwlock); + hash_for_each_safe(db->entries_hash, i, tmp, e, hentry) { + hash_del(&e->hentry); + kdbus_policy_entry_free(e); + } + up_write(&db->entries_rwlock); +} + +/** + * kdbus_policy_db_init() - initialize a new policy database + * @db: The location of the database + * + * This initializes a new policy-db. The underlying memory must have been + * cleared to zero by the caller. + */ +void kdbus_policy_db_init(struct kdbus_policy_db *db) +{ + hash_init(db->entries_hash); + init_rwsem(&db->entries_rwlock); +} + +/** + * kdbus_policy_query_unlocked() - Query the policy database + * @db: Policy database + * @cred: Credentials to test against + * @name: Name to query + * @hash: Hash value of @name + * + * Same as kdbus_policy_query() but requires the caller to lock the policy + * database against concurrent writes. + * + * Return: The highest KDBUS_POLICY_* access type found, or -EPERM if none.
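+ *
+ * Example (editorial illustration, not part of the original patch): a
+ * caller that only needs a one-off check would use the locked variant
+ * below and compare the result against the required access level:
+ *
+ *	ret = kdbus_policy_query(db, current_cred(), name,
+ *				 kdbus_strhash(name));
+ *	if (ret < KDBUS_POLICY_TALK)
+ *		return -EPERM;	/* no rule grants TALK or better */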
+ */ +int kdbus_policy_query_unlocked(struct kdbus_policy_db *db, + const struct cred *cred, const char *name, + unsigned int hash) +{ + struct kdbus_policy_db_entry_access *a; + const struct kdbus_policy_db_entry *e; + int i, highest = -EPERM; + + e = kdbus_policy_lookup(db, name, hash); + if (!e) + return -EPERM; + + list_for_each_entry(a, &e->access_list, list) { + if ((int)a->access <= highest) + continue; + + switch (a->type) { + case KDBUS_POLICY_ACCESS_USER: + if (uid_eq(cred->euid, a->uid)) + highest = a->access; + break; + case KDBUS_POLICY_ACCESS_GROUP: + if (gid_eq(cred->egid, a->gid)) { + highest = a->access; + break; + } + + for (i = 0; i < cred->group_info->ngroups; i++) { + kgid_t gid = GROUP_AT(cred->group_info, i); + + if (gid_eq(gid, a->gid)) { + highest = a->access; + break; + } + } + + break; + case KDBUS_POLICY_ACCESS_WORLD: + highest = a->access; + break; + } + + /* OWN is the highest possible policy */ + if (highest >= KDBUS_POLICY_OWN) + break; + } + + return highest; +} + +/** + * kdbus_policy_query() - Query the policy database + * @db: Policy database + * @cred: Credentials to test against + * @name: Name to query + * @hash: Hash value of @name + * + * Query the policy database @db for the access rights of @cred to the name + * @name. The access rights of @cred are returned, or -EPERM if no access is + * granted. + * + * This call effectively searches for the highest access-right granted to + * @cred. The caller should really cache those as policy lookups are rather + * expensive. + * + * Return: The highest KDBUS_POLICY_* access type found, or -EPERM if none. + */ +int kdbus_policy_query(struct kdbus_policy_db *db, const struct cred *cred, + const char *name, unsigned int hash) +{ + int ret; + + down_read(&db->entries_rwlock); + ret = kdbus_policy_query_unlocked(db, cred, name, hash); + up_read(&db->entries_rwlock); + + return ret; +} + +static void __kdbus_policy_remove_owner(struct kdbus_policy_db *db, + const void *owner) +{ + struct kdbus_policy_db_entry *e; + struct hlist_node *tmp; + int i; + + hash_for_each_safe(db->entries_hash, i, tmp, e, hentry) + if (e->owner == owner) { + hash_del(&e->hentry); + kdbus_policy_entry_free(e); + } +} + +/** + * kdbus_policy_remove_owner() - remove all entries related to a connection + * @db: The policy database + * @owner: The connection which items to remove + */ +void kdbus_policy_remove_owner(struct kdbus_policy_db *db, + const void *owner) +{ + down_write(&db->entries_rwlock); + __kdbus_policy_remove_owner(db, owner); + up_write(&db->entries_rwlock); +} + +/* + * Convert user provided policy access to internal kdbus policy + * access + */ +static struct kdbus_policy_db_entry_access * +kdbus_policy_make_access(const struct kdbus_policy_access *uaccess) +{ + int ret; + struct kdbus_policy_db_entry_access *a; + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return ERR_PTR(-ENOMEM); + + ret = -EINVAL; + switch (uaccess->access) { + case KDBUS_POLICY_SEE: + case KDBUS_POLICY_TALK: + case KDBUS_POLICY_OWN: + a->access = uaccess->access; + break; + default: + goto err; + } + + switch (uaccess->type) { + case KDBUS_POLICY_ACCESS_USER: + a->uid = make_kuid(current_user_ns(), uaccess->id); + if (!uid_valid(a->uid)) + goto err; + + break; + case KDBUS_POLICY_ACCESS_GROUP: + a->gid = make_kgid(current_user_ns(), uaccess->id); + if (!gid_valid(a->gid)) + goto err; + + break; + case KDBUS_POLICY_ACCESS_WORLD: + break; + default: + goto err; + } + + a->type = uaccess->type; + + return a; + +err: + kfree(a); + return 
ERR_PTR(ret); +} + +/** + * kdbus_policy_set() - set a connection's policy rules + * @db: The policy database + * @items: A list of kdbus_item elements that contain both + * names and access rules to set. + * @items_size: The total size of the items. + * @max_policies: The maximum number of policy entries to allow. + * Pass 0 for no limit. + * @allow_wildcards: Boolean value that controls whether wildcard + * entries (such as those ending in '.*') are allowed. + * @owner: The owner of the new policy items. + * + * This function sets a new set of policies for a given owner. The names and + * access rules are gathered by walking the list of items passed in as + * argument. An item of type KDBUS_ITEM_NAME is expected before any number of + * KDBUS_ITEM_POLICY_ACCESS items. If there are more repetitions of this + * pattern than denoted in @max_policies, -E2BIG is returned. + * + * In order to allow atomic replacement of rules, the function first removes + * all entries that have been created for the given owner previously. + * + * Callers to this function must make sure that the owner is a custom + * endpoint, or if the endpoint is a default endpoint, then it must be + * either a policy holder or an activator. + * + * Return: 0 on success, negative errno on failure. + */ +int kdbus_policy_set(struct kdbus_policy_db *db, + const struct kdbus_item *items, + size_t items_size, + size_t max_policies, + bool allow_wildcards, + const void *owner) +{ + struct kdbus_policy_db_entry_access *a; + struct kdbus_policy_db_entry *e, *p; + const struct kdbus_item *item; + struct hlist_node *tmp; + HLIST_HEAD(entries); + HLIST_HEAD(restore); + size_t count = 0; + int i, ret = 0; + u32 hash; + + /* Walk the list of items and look for new policies */ + e = NULL; + KDBUS_ITEMS_FOREACH(item, items, items_size) { + switch (item->type) { + case KDBUS_ITEM_NAME: { + size_t len; + + if (max_policies && ++count > max_policies) { + ret = -E2BIG; + goto exit; + } + + if (!kdbus_name_is_valid(item->str, true)) { + ret = -EINVAL; + goto exit; + } + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) { + ret = -ENOMEM; + goto exit; + } + + INIT_LIST_HEAD(&e->access_list); + e->owner = owner; + hlist_add_head(&e->hentry, &entries); + + e->name = kstrdup(item->str, GFP_KERNEL); + if (!e->name) { + ret = -ENOMEM; + goto exit; + } + + /* + * If a supplied name ends with '.*', cut off that + * part, store only what precedes it, and mark the + * entry as wildcard. + */ + len = strlen(e->name); + if (len > 2 && + e->name[len - 3] == '.'
&& + e->name[len - 2] == '*') { + if (!allow_wildcards) { + ret = -EINVAL; + goto exit; + } + + e->name[len - 3] = '\0'; + e->wildcard = true; + } + + break; + } + + case KDBUS_ITEM_POLICY_ACCESS: + if (!e) { + ret = -EINVAL; + goto exit; + } + + a = kdbus_policy_make_access(&item->policy_access); + if (IS_ERR(a)) { + ret = PTR_ERR(a); + goto exit; + } + + list_add_tail(&a->list, &e->access_list); + break; + } + } + + down_write(&db->entries_rwlock); + + /* remember previous entries to restore in case of failure */ + hash_for_each_safe(db->entries_hash, i, tmp, e, hentry) + if (e->owner == owner) { + hash_del(&e->hentry); + hlist_add_head(&e->hentry, &restore); + } + + hlist_for_each_entry_safe(e, tmp, &entries, hentry) { + /* prevent duplicates */ + hash = kdbus_strhash(e->name); + hash_for_each_possible(db->entries_hash, p, hentry, hash) + if (strcmp(e->name, p->name) == 0 && + e->wildcard == p->wildcard) { + ret = -EEXIST; + goto restore; + } + + hlist_del(&e->hentry); + hash_add(db->entries_hash, &e->hentry, hash); + } + +restore: + /* if we failed, flush all entries we added so far */ + if (ret < 0) + __kdbus_policy_remove_owner(db, owner); + + /* if we failed, restore entries, otherwise release them */ + hlist_for_each_entry_safe(e, tmp, &restore, hentry) { + hlist_del(&e->hentry); + if (ret < 0) { + hash = kdbus_strhash(e->name); + hash_add(db->entries_hash, &e->hentry, hash); + } else { + kdbus_policy_entry_free(e); + } + } + + up_write(&db->entries_rwlock); + +exit: + hlist_for_each_entry_safe(e, tmp, &entries, hentry) { + hlist_del(&e->hentry); + kdbus_policy_entry_free(e); + } + + return ret; +} diff -Nur linux-4.2/ipc/kdbus/policy.h linux-4.2-pck/ipc/kdbus/policy.h --- linux-4.2/ipc/kdbus/policy.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/policy.h 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */
+
+#ifndef __KDBUS_POLICY_H
+#define __KDBUS_POLICY_H
+
+#include <linux/hashtable.h>
+#include <linux/rwsem.h>
+
+struct kdbus_conn;
+struct kdbus_item;
+
+/**
+ * struct kdbus_policy_db - policy database
+ * @entries_hash:	Hashtable of entries
+ * @entries_rwlock:	RW semaphore protecting the database's access entries
+ */
+struct kdbus_policy_db {
+	DECLARE_HASHTABLE(entries_hash, 6);
+	struct rw_semaphore entries_rwlock;
+};
+
+void kdbus_policy_db_init(struct kdbus_policy_db *db);
+void kdbus_policy_db_clear(struct kdbus_policy_db *db);
+
+int kdbus_policy_query_unlocked(struct kdbus_policy_db *db,
+				const struct cred *cred, const char *name,
+				unsigned int hash);
+int kdbus_policy_query(struct kdbus_policy_db *db, const struct cred *cred,
+		       const char *name, unsigned int hash);
+
+void kdbus_policy_remove_owner(struct kdbus_policy_db *db,
+			       const void *owner);
+int kdbus_policy_set(struct kdbus_policy_db *db,
+		     const struct kdbus_item *items,
+		     size_t items_size,
+		     size_t max_policies,
+		     bool allow_wildcards,
+		     const void *owner);
+
+#endif
diff -Nur linux-4.2/ipc/kdbus/pool.c linux-4.2-pck/ipc/kdbus/pool.c
--- linux-4.2/ipc/kdbus/pool.c	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/pool.c	2015-09-08 00:55:40.120721602 -0300
@@ -0,0 +1,728 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/aio.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/shmem_fs.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+
+#include "pool.h"
+#include "util.h"
+
+/**
+ * struct kdbus_pool - the receiver's buffer
+ * @f:			The backing shmem file
+ * @size:		The size of the file
+ * @accounted_size:	Currently accounted memory in bytes
+ * @lock:		Pool data lock
+ * @slices:		All slices sorted by address
+ * @slices_busy:	Tree of allocated slices
+ * @slices_free:	Tree of free slices
+ *
+ * The receiver's buffer, managed as a pool of allocated and free
+ * slices containing the queued messages.
+ *
+ * Messages sent with KDBUS_CMD_SEND are copied directly by the
+ * sending process into the receiver's pool.
+ *
+ * Messages received with KDBUS_CMD_RECV just return the offset
+ * to the data placed in the pool.
+ *
+ * The internally allocated memory needs to be returned by the receiver
+ * with KDBUS_CMD_FREE.
+ */
+struct kdbus_pool {
+	struct file *f;
+	size_t size;
+	size_t accounted_size;
+	struct mutex lock;
+
+	struct list_head slices;
+	struct rb_root slices_busy;
+	struct rb_root slices_free;
+};
+
+/**
+ * struct kdbus_pool_slice - allocated element in kdbus_pool
+ * @pool:		Pool this slice belongs to
+ * @off:		Offset of slice in the shmem file
+ * @size:		Size of slice
+ * @entry:		Entry in "all slices" list
+ * @rb_node:		Entry in free or busy list
+ * @free:		Unused slice
+ * @accounted:		Accounted as queue slice
+ * @ref_kernel:		Kernel holds a reference
+ * @ref_user:		Userspace holds a reference
+ *
+ * The pool has one or more slices, always spanning the entire size of the
+ * pool.
+ *
+ * Every slice is an element in a list sorted by the buffer address, to
+ * provide access to the next neighbor slice.
+ *
+ * Every slice is a member of either the busy or the free tree. The free
+ * tree is organized by slice size, the busy tree is organized by buffer
+ * offset.
+ */
+struct kdbus_pool_slice {
+	struct kdbus_pool *pool;
+	size_t off;
+	size_t size;
+
+	struct list_head entry;
+	struct rb_node rb_node;
+
+	bool free:1;
+	bool accounted:1;
+	bool ref_kernel:1;
+	bool ref_user:1;
+};
+
+static struct kdbus_pool_slice *kdbus_pool_slice_new(struct kdbus_pool *pool,
+						     size_t off, size_t size)
+{
+	struct kdbus_pool_slice *slice;
+
+	slice = kzalloc(sizeof(*slice), GFP_KERNEL);
+	if (!slice)
+		return NULL;
+
+	slice->pool = pool;
+	slice->off = off;
+	slice->size = size;
+	slice->free = true;
+	return slice;
+}
+
+/* insert a slice into the free tree */
+static void kdbus_pool_add_free_slice(struct kdbus_pool *pool,
+				      struct kdbus_pool_slice *slice)
+{
+	struct rb_node **n;
+	struct rb_node *pn = NULL;
+
+	n = &pool->slices_free.rb_node;
+	while (*n) {
+		struct kdbus_pool_slice *pslice;
+
+		pn = *n;
+		pslice = rb_entry(pn, struct kdbus_pool_slice, rb_node);
+		if (slice->size < pslice->size)
+			n = &pn->rb_left;
+		else
+			n = &pn->rb_right;
+	}
+
+	rb_link_node(&slice->rb_node, pn, n);
+	rb_insert_color(&slice->rb_node, &pool->slices_free);
+}
+
+/* insert a slice into the busy tree */
+static void kdbus_pool_add_busy_slice(struct kdbus_pool *pool,
+				      struct kdbus_pool_slice *slice)
+{
+	struct rb_node **n;
+	struct rb_node *pn = NULL;
+
+	n = &pool->slices_busy.rb_node;
+	while (*n) {
+		struct kdbus_pool_slice *pslice;
+
+		pn = *n;
+		pslice = rb_entry(pn, struct kdbus_pool_slice, rb_node);
+		if (slice->off < pslice->off)
+			n = &pn->rb_left;
+		else if (slice->off > pslice->off)
+			n = &pn->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&slice->rb_node, pn, n);
+	rb_insert_color(&slice->rb_node, &pool->slices_busy);
+}
+
+static struct kdbus_pool_slice *kdbus_pool_find_slice(struct kdbus_pool *pool,
+						      size_t off)
+{
+	struct rb_node *n;
+
+	n = pool->slices_busy.rb_node;
+	while (n) {
+		struct kdbus_pool_slice *s;
+
+		s = rb_entry(n, struct kdbus_pool_slice, rb_node);
+		if (off < s->off)
+			n = n->rb_left;
+		else if (off > s->off)
+			n = n->rb_right;
+		else
+			return s;
+	}
+
+	return NULL;
+}
+
+/**
+ * kdbus_pool_slice_alloc() - allocate memory from a pool
+ * @pool:	The receiver's pool
+ * @size:	The number of bytes to allocate
+ * @accounted:	Whether this slice should be accounted for
+ *
+ * The returned slice must later be passed to kdbus_pool_slice_release() to
+ * free the allocated memory. Data can be copied into the slice with
+ * kdbus_pool_slice_copy_kvec() or kdbus_pool_slice_copy_iovec().
+ *
+ * Return: the allocated slice on success, ERR_PTR on failure.
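+ *
+ * A minimal usage sketch (assuming a previously set-up @pool; locking
+ * and the later data copy are elided):
+ *
+ *	struct kdbus_pool_slice *s;
+ *
+ *	s = kdbus_pool_slice_alloc(pool, 100, true);
+ *	if (IS_ERR(s))
+ *		return PTR_ERR(s);
+ *
+ * The requested size is rounded up to a multiple of 8 (104 bytes here).
+ * If no free slice matches the size exactly, the closest larger slice
+ * is split and the remainder is kept in the free tree.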
+ */ +struct kdbus_pool_slice *kdbus_pool_slice_alloc(struct kdbus_pool *pool, + size_t size, bool accounted) +{ + size_t slice_size = KDBUS_ALIGN8(size); + struct rb_node *n, *found = NULL; + struct kdbus_pool_slice *s; + int ret = 0; + + if (WARN_ON(!size)) + return ERR_PTR(-EINVAL); + + /* search a free slice with the closest matching size */ + mutex_lock(&pool->lock); + n = pool->slices_free.rb_node; + while (n) { + s = rb_entry(n, struct kdbus_pool_slice, rb_node); + if (slice_size < s->size) { + found = n; + n = n->rb_left; + } else if (slice_size > s->size) { + n = n->rb_right; + } else { + found = n; + break; + } + } + + /* no slice with the minimum size found in the pool */ + if (!found) { + ret = -EXFULL; + goto exit_unlock; + } + + /* no exact match, use the closest one */ + if (!n) { + struct kdbus_pool_slice *s_new; + + s = rb_entry(found, struct kdbus_pool_slice, rb_node); + + /* split-off the remainder of the size to its own slice */ + s_new = kdbus_pool_slice_new(pool, s->off + slice_size, + s->size - slice_size); + if (!s_new) { + ret = -ENOMEM; + goto exit_unlock; + } + + list_add(&s_new->entry, &s->entry); + kdbus_pool_add_free_slice(pool, s_new); + + /* adjust our size now that we split-off another slice */ + s->size = slice_size; + } + + /* move slice from free to the busy tree */ + rb_erase(found, &pool->slices_free); + kdbus_pool_add_busy_slice(pool, s); + + WARN_ON(s->ref_kernel || s->ref_user); + + s->ref_kernel = true; + s->free = false; + s->accounted = accounted; + if (accounted) + pool->accounted_size += s->size; + mutex_unlock(&pool->lock); + + return s; + +exit_unlock: + mutex_unlock(&pool->lock); + return ERR_PTR(ret); +} + +static void __kdbus_pool_slice_release(struct kdbus_pool_slice *slice) +{ + struct kdbus_pool *pool = slice->pool; + + /* don't free the slice if either has a reference */ + if (slice->ref_kernel || slice->ref_user) + return; + + if (WARN_ON(slice->free)) + return; + + rb_erase(&slice->rb_node, &pool->slices_busy); + + /* merge with the next free slice */ + if (!list_is_last(&slice->entry, &pool->slices)) { + struct kdbus_pool_slice *s; + + s = list_entry(slice->entry.next, + struct kdbus_pool_slice, entry); + if (s->free) { + rb_erase(&s->rb_node, &pool->slices_free); + list_del(&s->entry); + slice->size += s->size; + kfree(s); + } + } + + /* merge with previous free slice */ + if (pool->slices.next != &slice->entry) { + struct kdbus_pool_slice *s; + + s = list_entry(slice->entry.prev, + struct kdbus_pool_slice, entry); + if (s->free) { + rb_erase(&s->rb_node, &pool->slices_free); + list_del(&slice->entry); + s->size += slice->size; + kfree(slice); + slice = s; + } + } + + slice->free = true; + kdbus_pool_add_free_slice(pool, slice); +} + +/** + * kdbus_pool_slice_release() - drop kernel-reference on allocated slice + * @slice: Slice allocated from the pool + * + * This releases the kernel-reference on the given slice. If the + * kernel-reference and the user-reference on a slice are dropped, the slice is + * returned to the pool. + * + * So far, we do not implement full ref-counting on slices. Each, kernel and + * user-space can have exactly one reference to a slice. If both are dropped at + * the same time, the slice is released. 
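+ *
+ * A minimal sketch of the two-reference life-cycle (assuming a slice
+ * that is also handed to user-space via kdbus_pool_slice_publish()):
+ *
+ *	s = kdbus_pool_slice_alloc(pool, size, true);	(takes ref_kernel)
+ *	kdbus_pool_slice_publish(s, &off, &sz);		(takes ref_user)
+ *	kdbus_pool_slice_release(s);			(drops ref_kernel)
+ *	kdbus_pool_release_offset(pool, off);		(drops ref_user,
+ *							 slice goes back to
+ *							 the free tree)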
+ */
+void kdbus_pool_slice_release(struct kdbus_pool_slice *slice)
+{
+	struct kdbus_pool *pool;
+
+	if (!slice)
+		return;
+
+	/* @slice may be freed, so keep local ptr to @pool */
+	pool = slice->pool;
+
+	mutex_lock(&pool->lock);
+	/* kernel must own a ref to @slice to drop it */
+	WARN_ON(!slice->ref_kernel);
+	slice->ref_kernel = false;
+	/* no longer kernel-owned, de-account slice */
+	if (slice->accounted && !WARN_ON(pool->accounted_size < slice->size))
+		pool->accounted_size -= slice->size;
+	__kdbus_pool_slice_release(slice);
+	mutex_unlock(&pool->lock);
+}
+
+/**
+ * kdbus_pool_release_offset() - release a public offset
+ * @pool:	pool to operate on
+ * @off:	offset to release
+ *
+ * This should be called whenever user-space frees a slice given to them. It
+ * verifies the slice is available and public, and then drops it. It ensures
+ * correct locking and barriers against queues.
+ *
+ * Return: 0 on success, -ENXIO if the offset is invalid or not public.
+ */
+int kdbus_pool_release_offset(struct kdbus_pool *pool, size_t off)
+{
+	struct kdbus_pool_slice *slice;
+	int ret = 0;
+
+	/* 'pool->size' is used as dummy offset for empty slices */
+	if (off == pool->size)
+		return 0;
+
+	mutex_lock(&pool->lock);
+	slice = kdbus_pool_find_slice(pool, off);
+	if (slice && slice->ref_user) {
+		slice->ref_user = false;
+		__kdbus_pool_slice_release(slice);
+	} else {
+		ret = -ENXIO;
+	}
+	mutex_unlock(&pool->lock);
+
+	return ret;
+}
+
+/**
+ * kdbus_pool_publish_empty() - publish empty slice to user-space
+ * @pool:	pool to operate on
+ * @off:	output storage for offset, or NULL
+ * @size:	output storage for size, or NULL
+ *
+ * This is the same as kdbus_pool_slice_publish(), but uses a dummy slice with
+ * size 0. The returned offset points to the end of the pool and is never
+ * returned on real slices.
+ */
+void kdbus_pool_publish_empty(struct kdbus_pool *pool, u64 *off, u64 *size)
+{
+	if (off)
+		*off = pool->size;
+	if (size)
+		*size = 0;
+}
+
+/**
+ * kdbus_pool_slice_publish() - publish slice to user-space
+ * @slice:		The slice
+ * @out_offset:		Output storage for offset, or NULL
+ * @out_size:		Output storage for size, or NULL
+ *
+ * This prepares a slice to be published to user-space.
+ *
+ * This call combines the following operations:
+ *   * the memory region is flushed so the user's memory view is consistent
+ *   * the slice is marked as referenced by user-space, so user-space has to
+ *     call KDBUS_CMD_FREE to release it
+ *   * the offset and size of the slice are written to the given output
+ *     arguments, if non-NULL
+ */
+void kdbus_pool_slice_publish(struct kdbus_pool_slice *slice,
+			      u64 *out_offset, u64 *out_size)
+{
+	mutex_lock(&slice->pool->lock);
+	/* kernel must own a ref to @slice to gain a user-space ref */
+	WARN_ON(!slice->ref_kernel);
+	slice->ref_user = true;
+	mutex_unlock(&slice->pool->lock);
+
+	if (out_offset)
+		*out_offset = slice->off;
+	if (out_size)
+		*out_size = slice->size;
+}
+
+/**
+ * kdbus_pool_slice_offset() - Get a slice's offset inside the pool
+ * @slice:	Slice to return the offset of
+ *
+ * Return: The internal offset of @slice inside the pool.
+ */
+off_t kdbus_pool_slice_offset(const struct kdbus_pool_slice *slice)
+{
+	return slice->off;
+}
+
+/**
+ * kdbus_pool_slice_size() - get size of a pool slice
+ * @slice:	slice to query
+ *
+ * Return: size of the given slice
+ */
+size_t kdbus_pool_slice_size(const struct kdbus_pool_slice *slice)
+{
+	return slice->size;
+}
+
+/**
+ * kdbus_pool_new() - create a new pool
+ * @name:	Name of the (deleted) file which shows up in
+ *		/proc, used for debugging
+ * @size:	Maximum size of the pool
+ *
+ * Return: a new kdbus_pool on success, ERR_PTR on failure.
+ */
+struct kdbus_pool *kdbus_pool_new(const char *name, size_t size)
+{
+	struct kdbus_pool_slice *s;
+	struct kdbus_pool *p;
+	struct file *f;
+	char *n = NULL;
+	int ret;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	if (name) {
+		n = kasprintf(GFP_KERNEL, KBUILD_MODNAME "-conn:%s", name);
+		if (!n) {
+			ret = -ENOMEM;
+			goto exit_free;
+		}
+	}
+
+	f = shmem_file_setup(n ?: KBUILD_MODNAME "-conn", size, 0, 0);
+	kfree(n);
+
+	if (IS_ERR(f)) {
+		ret = PTR_ERR(f);
+		goto exit_free;
+	}
+
+	ret = get_write_access(file_inode(f));
+	if (ret < 0)
+		goto exit_put_shmem;
+
+	/* allocate first slice spanning the entire pool */
+	s = kdbus_pool_slice_new(p, 0, size);
+	if (!s) {
+		ret = -ENOMEM;
+		goto exit_put_write;
+	}
+
+	p->f = f;
+	p->size = size;
+	p->slices_free = RB_ROOT;
+	p->slices_busy = RB_ROOT;
+	mutex_init(&p->lock);
+
+	INIT_LIST_HEAD(&p->slices);
+	list_add(&s->entry, &p->slices);
+
+	kdbus_pool_add_free_slice(p, s);
+	return p;
+
+exit_put_write:
+	put_write_access(file_inode(f));
+exit_put_shmem:
+	fput(f);
+exit_free:
+	kfree(p);
+	return ERR_PTR(ret);
+}
+
+/**
+ * kdbus_pool_free() - destroy pool
+ * @pool:	The receiver's pool
+ */
+void kdbus_pool_free(struct kdbus_pool *pool)
+{
+	struct kdbus_pool_slice *s, *tmp;
+
+	if (!pool)
+		return;
+
+	list_for_each_entry_safe(s, tmp, &pool->slices, entry) {
+		list_del(&s->entry);
+		kfree(s);
+	}
+
+	put_write_access(file_inode(pool->f));
+	fput(pool->f);
+	kfree(pool);
+}
+
+/**
+ * kdbus_pool_accounted() - retrieve accounting information
+ * @pool:	pool to query
+ * @size:	output for overall pool size
+ * @acc:	output for currently accounted size
+ *
+ * This returns accounting information of the pool. Note that the data might
+ * change after the function returns, as the pool lock is dropped. You need to
+ * protect the data via other means, if you need reliable accounting.
+ */
+void kdbus_pool_accounted(struct kdbus_pool *pool, size_t *size, size_t *acc)
+{
+	mutex_lock(&pool->lock);
+	if (size)
+		*size = pool->size;
+	if (acc)
+		*acc = pool->accounted_size;
+	mutex_unlock(&pool->lock);
+}
+
+/**
+ * kdbus_pool_slice_copy_iovec() - copy user memory to a slice
+ * @slice:	The slice to write to
+ * @off:	Offset in the slice to write to
+ * @iov:	iovec array, pointing to data to copy
+ * @iov_len:	Number of elements in @iov
+ * @total_len:	Total number of bytes described in members of @iov
+ *
+ * User memory referenced by @iov will be copied into @slice at offset @off.
+ *
+ * Return: the number of bytes copied, negative errno on failure.
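+ *
+ * A minimal sketch (assuming @iov was previously filled from a
+ * user-supplied message description and adds up to @total_len bytes):
+ *
+ *	struct iovec iov[2];	(set up by the caller)
+ *	ssize_t n;
+ *
+ *	n = kdbus_pool_slice_copy_iovec(slice, 0, iov, 2, total_len);
+ *	if (n < 0)
+ *		return n;	(short copies are reported as -EFAULT)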
+ */
+ssize_t
+kdbus_pool_slice_copy_iovec(const struct kdbus_pool_slice *slice, loff_t off,
+			    struct iovec *iov, size_t iov_len, size_t total_len)
+{
+	struct iov_iter iter;
+	ssize_t len;
+
+	if (WARN_ON(off + total_len > slice->size))
+		return -EFAULT;
+
+	off += slice->off;
+	iov_iter_init(&iter, WRITE, iov, iov_len, total_len);
+	len = vfs_iter_write(slice->pool->f, &iter, &off);
+
+	return (len >= 0 && len != total_len) ? -EFAULT : len;
+}
+
+/**
+ * kdbus_pool_slice_copy_kvec() - copy kernel memory to a slice
+ * @slice:	The slice to write to
+ * @off:	Offset in the slice to write to
+ * @kvec:	kvec array, pointing to data to copy
+ * @kvec_len:	Number of elements in @kvec
+ * @total_len:	Total number of bytes described in members of @kvec
+ *
+ * Kernel memory referenced by @kvec will be copied into @slice at offset @off.
+ *
+ * Return: the number of bytes copied, negative errno on failure.
+ */
+ssize_t kdbus_pool_slice_copy_kvec(const struct kdbus_pool_slice *slice,
+				   loff_t off, struct kvec *kvec,
+				   size_t kvec_len, size_t total_len)
+{
+	struct iov_iter iter;
+	mm_segment_t old_fs;
+	ssize_t len;
+
+	if (WARN_ON(off + total_len > slice->size))
+		return -EFAULT;
+
+	off += slice->off;
+	iov_iter_kvec(&iter, WRITE | ITER_KVEC, kvec, kvec_len, total_len);
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	len = vfs_iter_write(slice->pool->f, &iter, &off);
+	set_fs(old_fs);
+
+	return (len >= 0 && len != total_len) ? -EFAULT : len;
+}
+
+/**
+ * kdbus_pool_slice_copy() - copy data from one slice into another
+ * @slice_dst:	destination slice
+ * @slice_src:	source slice
+ *
+ * Return: 0 on success, negative error number on failure.
+ */
+int kdbus_pool_slice_copy(const struct kdbus_pool_slice *slice_dst,
+			  const struct kdbus_pool_slice *slice_src)
+{
+	struct file *f_src = slice_src->pool->f;
+	struct file *f_dst = slice_dst->pool->f;
+	struct inode *i_dst = file_inode(f_dst);
+	struct address_space *mapping_dst = f_dst->f_mapping;
+	const struct address_space_operations *aops = mapping_dst->a_ops;
+	unsigned long len = slice_src->size;
+	loff_t off_src = slice_src->off;
+	loff_t off_dst = slice_dst->off;
+	mm_segment_t old_fs;
+	int ret = 0;
+
+	if (WARN_ON(slice_src->size != slice_dst->size) ||
+	    WARN_ON(slice_src->free || slice_dst->free))
+		return -EINVAL;
+
+	mutex_lock(&i_dst->i_mutex);
+	old_fs = get_fs();
+	set_fs(get_ds());
+	while (len > 0) {
+		unsigned long page_off;
+		unsigned long copy_len;
+		char __user *kaddr;
+		struct page *page;
+		ssize_t n_read;
+		void *fsdata;
+		long status;
+
+		page_off = off_dst & (PAGE_CACHE_SIZE - 1);
+		copy_len = min_t(unsigned long,
+				 PAGE_CACHE_SIZE - page_off, len);
+
+		status = aops->write_begin(f_dst, mapping_dst, off_dst,
+					   copy_len, 0, &page, &fsdata);
+		if (unlikely(status < 0)) {
+			ret = status;
+			break;
+		}
+
+		kaddr = (char __force __user *)kmap(page) + page_off;
+		n_read = __vfs_read(f_src, kaddr, copy_len, &off_src);
+		kunmap(page);
+		mark_page_accessed(page);
+		flush_dcache_page(page);
+
+		if (unlikely(n_read != copy_len)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		status = aops->write_end(f_dst, mapping_dst, off_dst,
+					 copy_len, copy_len, page, fsdata);
+		if (unlikely(status != copy_len)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		off_dst += copy_len;
+		len -= copy_len;
+	}
+	set_fs(old_fs);
+	mutex_unlock(&i_dst->i_mutex);
+
+	return ret;
+}
+
+/**
+ * kdbus_pool_mmap() - map the pool into the process
+ * @pool:	The receiver's pool
+ * @vma:	passed by mmap() syscall
+ *
+ * Return: the result of the mmap() call, negative errno on failure.
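+ *
+ * The counterpart in user-space is a read-only mapping of the pool; a
+ * rough sketch (assuming a connection file descriptor conn_fd and the
+ * usual KDBUS_CMD_RECV/KDBUS_CMD_FREE ioctl structs, details omitted):
+ *
+ *	uint8_t *pool = mmap(NULL, pool_size, PROT_READ, MAP_SHARED,
+ *			     conn_fd, 0);
+ *	ioctl(conn_fd, KDBUS_CMD_RECV, &recv);
+ *	msg = (struct kdbus_msg *)(pool + recv.msg.offset);
+ *	(... consume the message ...)
+ *	ioctl(conn_fd, KDBUS_CMD_FREE, &cmd_free);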
+ */
+int kdbus_pool_mmap(const struct kdbus_pool *pool, struct vm_area_struct *vma)
+{
+	/* deny write access to the pool */
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+	vma->vm_flags &= ~VM_MAYWRITE;
+
+	/* do not allow mapping more than the size of the file */
+	if ((vma->vm_end - vma->vm_start) > pool->size)
+		return -EFAULT;
+
+	/* replace the connection file with our shmem file */
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	vma->vm_file = get_file(pool->f);
+
+	return pool->f->f_op->mmap(pool->f, vma);
+}
diff -Nur linux-4.2/ipc/kdbus/pool.h linux-4.2-pck/ipc/kdbus/pool.h
--- linux-4.2/ipc/kdbus/pool.h	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/pool.h	2015-09-08 00:48:22.238363406 -0300
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_POOL_H
+#define __KDBUS_POOL_H
+
+#include <linux/uio.h>
+
+struct kdbus_pool;
+struct kdbus_pool_slice;
+
+struct kdbus_pool *kdbus_pool_new(const char *name, size_t size);
+void kdbus_pool_free(struct kdbus_pool *pool);
+void kdbus_pool_accounted(struct kdbus_pool *pool, size_t *size, size_t *acc);
+int kdbus_pool_mmap(const struct kdbus_pool *pool, struct vm_area_struct *vma);
+int kdbus_pool_release_offset(struct kdbus_pool *pool, size_t off);
+void kdbus_pool_publish_empty(struct kdbus_pool *pool, u64 *off, u64 *size);
+
+struct kdbus_pool_slice *kdbus_pool_slice_alloc(struct kdbus_pool *pool,
+						size_t size, bool accounted);
+void kdbus_pool_slice_release(struct kdbus_pool_slice *slice);
+void kdbus_pool_slice_publish(struct kdbus_pool_slice *slice,
+			      u64 *out_offset, u64 *out_size);
+off_t kdbus_pool_slice_offset(const struct kdbus_pool_slice *slice);
+size_t kdbus_pool_slice_size(const struct kdbus_pool_slice *slice);
+int kdbus_pool_slice_copy(const struct kdbus_pool_slice *slice_dst,
+			  const struct kdbus_pool_slice *slice_src);
+ssize_t kdbus_pool_slice_copy_kvec(const struct kdbus_pool_slice *slice,
+				   loff_t off, struct kvec *kvec,
+				   size_t kvec_count, size_t total_len);
+ssize_t kdbus_pool_slice_copy_iovec(const struct kdbus_pool_slice *slice,
+				    loff_t off, struct iovec *iov,
+				    size_t iov_count, size_t total_len);
+
+#endif
diff -Nur linux-4.2/ipc/kdbus/queue.c linux-4.2-pck/ipc/kdbus/queue.c
--- linux-4.2/ipc/kdbus/queue.c	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/queue.c	2015-09-08 00:48:22.238363406 -0300
@@ -0,0 +1,363 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "domain.h" +#include "connection.h" +#include "item.h" +#include "message.h" +#include "metadata.h" +#include "queue.h" +#include "reply.h" + +/** + * kdbus_queue_init() - initialize data structure related to a queue + * @queue: The queue to initialize + */ +void kdbus_queue_init(struct kdbus_queue *queue) +{ + INIT_LIST_HEAD(&queue->msg_list); + queue->msg_prio_queue = RB_ROOT; +} + +/** + * kdbus_queue_peek() - Retrieves an entry from a queue + * @queue: The queue + * @priority: The minimum priority of the entry to peek + * @use_priority: Boolean flag whether or not to peek by priority + * + * Look for a entry in a queue, either by priority, or the oldest one (FIFO). + * The entry is not freed, put off the queue's lists or anything else. + * + * Return: the peeked queue entry on success, NULL if no suitable msg is found + */ +struct kdbus_queue_entry *kdbus_queue_peek(struct kdbus_queue *queue, + s64 priority, bool use_priority) +{ + struct kdbus_queue_entry *e; + + if (list_empty(&queue->msg_list)) + return NULL; + + if (use_priority) { + /* get next entry with highest priority */ + e = rb_entry(queue->msg_prio_highest, + struct kdbus_queue_entry, prio_node); + + /* no entry with the requested priority */ + if (e->priority > priority) + return NULL; + } else { + /* ignore the priority, return the next entry in the entry */ + e = list_first_entry(&queue->msg_list, + struct kdbus_queue_entry, entry); + } + + return e; +} + +static void kdbus_queue_entry_link(struct kdbus_queue_entry *entry) +{ + struct kdbus_queue *queue = &entry->conn->queue; + struct rb_node **n, *pn = NULL; + bool highest = true; + + lockdep_assert_held(&entry->conn->lock); + if (WARN_ON(!list_empty(&entry->entry))) + return; + + /* sort into priority entry tree */ + n = &queue->msg_prio_queue.rb_node; + while (*n) { + struct kdbus_queue_entry *e; + + pn = *n; + e = rb_entry(pn, struct kdbus_queue_entry, prio_node); + + /* existing node for this priority, add to its list */ + if (likely(entry->priority == e->priority)) { + list_add_tail(&entry->prio_entry, &e->prio_entry); + goto prio_done; + } + + if (entry->priority < e->priority) { + n = &pn->rb_left; + } else { + n = &pn->rb_right; + highest = false; + } + } + + /* cache highest-priority entry */ + if (highest) + queue->msg_prio_highest = &entry->prio_node; + + /* new node for this priority */ + rb_link_node(&entry->prio_node, pn, n); + rb_insert_color(&entry->prio_node, &queue->msg_prio_queue); + INIT_LIST_HEAD(&entry->prio_entry); + +prio_done: + /* add to unsorted fifo list */ + list_add_tail(&entry->entry, &queue->msg_list); +} + +static void kdbus_queue_entry_unlink(struct kdbus_queue_entry *entry) +{ + struct kdbus_queue *queue = &entry->conn->queue; + + lockdep_assert_held(&entry->conn->lock); + if (list_empty(&entry->entry)) + return; + + list_del_init(&entry->entry); + + if (list_empty(&entry->prio_entry)) { + /* + * Single entry for this priority, update cached + * highest-priority entry, remove the tree node. + */ + if (queue->msg_prio_highest == &entry->prio_node) + queue->msg_prio_highest = rb_next(&entry->prio_node); + + rb_erase(&entry->prio_node, &queue->msg_prio_queue); + } else { + struct kdbus_queue_entry *q; + + /* + * Multiple entries for this priority entry, get next one in + * the list. 
Update cached highest-priority entry, store the + * new one as the tree node. + */ + q = list_first_entry(&entry->prio_entry, + struct kdbus_queue_entry, prio_entry); + list_del(&entry->prio_entry); + + if (queue->msg_prio_highest == &entry->prio_node) + queue->msg_prio_highest = &q->prio_node; + + rb_replace_node(&entry->prio_node, &q->prio_node, + &queue->msg_prio_queue); + } +} + +/** + * kdbus_queue_entry_new() - allocate a queue entry + * @src: source connection, or NULL + * @dst: destination connection + * @s: staging object carrying the message + * + * Allocates a queue entry based on a given msg and allocate space for + * the message payload and the requested metadata in the connection's pool. + * The entry is not actually added to the queue's lists at this point. + * + * Return: the allocated entry on success, or an ERR_PTR on failures. + */ +struct kdbus_queue_entry *kdbus_queue_entry_new(struct kdbus_conn *src, + struct kdbus_conn *dst, + struct kdbus_staging *s) +{ + struct kdbus_queue_entry *entry; + int ret; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&entry->entry); + entry->priority = s->msg->priority; + entry->conn = kdbus_conn_ref(dst); + entry->gaps = kdbus_gaps_ref(s->gaps); + + entry->slice = kdbus_staging_emit(s, src, dst); + if (IS_ERR(entry->slice)) { + ret = PTR_ERR(entry->slice); + entry->slice = NULL; + goto error; + } + + entry->user = src ? kdbus_user_ref(src->user) : NULL; + return entry; + +error: + kdbus_queue_entry_free(entry); + return ERR_PTR(ret); +} + +/** + * kdbus_queue_entry_free() - free resources of an entry + * @entry: The entry to free + * + * Removes resources allocated by a queue entry, along with the entry itself. + * Note that the entry's slice is not freed at this point. + */ +void kdbus_queue_entry_free(struct kdbus_queue_entry *entry) +{ + if (!entry) + return; + + lockdep_assert_held(&entry->conn->lock); + + kdbus_queue_entry_unlink(entry); + kdbus_reply_unref(entry->reply); + + if (entry->slice) { + kdbus_conn_quota_dec(entry->conn, entry->user, + kdbus_pool_slice_size(entry->slice), + entry->gaps ? entry->gaps->n_fds : 0); + kdbus_pool_slice_release(entry->slice); + } + + kdbus_user_unref(entry->user); + kdbus_gaps_unref(entry->gaps); + kdbus_conn_unref(entry->conn); + kfree(entry); +} + +/** + * kdbus_queue_entry_install() - install message components into the + * receiver's process + * @entry: The queue entry to install + * @return_flags: Pointer to store the return flags for userspace + * @install_fds: Whether or not to install associated file descriptors + * + * Return: 0 on success. + */ +int kdbus_queue_entry_install(struct kdbus_queue_entry *entry, + u64 *return_flags, bool install_fds) +{ + bool incomplete_fds = false; + int ret; + + lockdep_assert_held(&entry->conn->lock); + + ret = kdbus_gaps_install(entry->gaps, entry->slice, &incomplete_fds); + if (ret < 0) + return ret; + + if (incomplete_fds) + *return_flags |= KDBUS_RECV_RETURN_INCOMPLETE_FDS; + return 0; +} + +/** + * kdbus_queue_entry_enqueue() - enqueue an entry + * @entry: entry to enqueue + * @reply: reply to link to this entry (or NULL if none) + * + * This enqueues an unqueued entry into the message queue of the linked + * connection. It also binds a reply object to the entry so we can remember it + * when the message is moved. + * + * Once this call returns (and the connection lock is released), this entry can + * be dequeued by the target connection. 
Note that the entry will not be removed
+ * from the queue until it is destroyed.
+ */
+void kdbus_queue_entry_enqueue(struct kdbus_queue_entry *entry,
+			       struct kdbus_reply *reply)
+{
+	lockdep_assert_held(&entry->conn->lock);
+
+	if (WARN_ON(entry->reply) || WARN_ON(!list_empty(&entry->entry)))
+		return;
+
+	entry->reply = kdbus_reply_ref(reply);
+	kdbus_queue_entry_link(entry);
+}
+
+/**
+ * kdbus_queue_entry_move() - move queue entry
+ * @e:		queue entry to move
+ * @dst:	destination connection to queue the entry on
+ *
+ * This moves a queue entry onto a different connection. It allocates a new
+ * slice on the target connection and copies the message over. If the copy
+ * succeeded, we move the entry from its current connection to @dst.
+ *
+ * On failure, the entry is left untouched.
+ *
+ * The queue entry must currently be queued; after the call succeeds, it will
+ * be queued on the destination, but no longer on the source.
+ *
+ * The caller must hold the connection lock of the source *and* destination.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kdbus_queue_entry_move(struct kdbus_queue_entry *e,
+			   struct kdbus_conn *dst)
+{
+	struct kdbus_pool_slice *slice = NULL;
+	struct kdbus_conn *src = e->conn;
+	size_t size, fds;
+	int ret;
+
+	lockdep_assert_held(&src->lock);
+	lockdep_assert_held(&dst->lock);
+
+	if (WARN_ON(list_empty(&e->entry)))
+		return -EINVAL;
+	if (src == dst)
+		return 0;
+
+	size = kdbus_pool_slice_size(e->slice);
+	fds = e->gaps ? e->gaps->n_fds : 0;
+
+	ret = kdbus_conn_quota_inc(dst, e->user, size, fds);
+	if (ret < 0)
+		return ret;
+
+	slice = kdbus_pool_slice_alloc(dst->pool, size, true);
+	if (IS_ERR(slice)) {
+		ret = PTR_ERR(slice);
+		slice = NULL;
+		goto error;
+	}
+
+	ret = kdbus_pool_slice_copy(slice, e->slice);
+	if (ret < 0)
+		goto error;
+
+	kdbus_queue_entry_unlink(e);
+	kdbus_conn_quota_dec(src, e->user, size, fds);
+	kdbus_pool_slice_release(e->slice);
+	kdbus_conn_unref(e->conn);
+
+	e->slice = slice;
+	e->conn = kdbus_conn_ref(dst);
+	kdbus_queue_entry_link(e);
+
+	return 0;
+
+error:
+	kdbus_pool_slice_release(slice);
+	kdbus_conn_quota_dec(dst, e->user, size, fds);
+	return ret;
+}
diff -Nur linux-4.2/ipc/kdbus/queue.h linux-4.2-pck/ipc/kdbus/queue.h
--- linux-4.2/ipc/kdbus/queue.h	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/queue.h	2015-09-08 00:48:22.238363406 -0300
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_QUEUE_H
+#define __KDBUS_QUEUE_H
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
+struct kdbus_conn;
+struct kdbus_pool_slice;
+struct kdbus_reply;
+struct kdbus_staging;
+struct kdbus_user;
+
+/**
+ * struct kdbus_queue - a connection's message queue
+ * @msg_list:		List head for kdbus_queue_entry objects
+ * @msg_prio_queue:	RB tree root for messages, sorted by priority
+ * @msg_prio_highest:	Link to the RB node referencing the message with the
+ *			highest priority in the tree.
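+ *
+ * A sketch of the resulting layout with three queued entries, two of
+ * them sharing one priority (lower values are dispatched first):
+ *
+ *	msg_prio_queue:       [-5]          [0]
+ *	                     entry A      entry B --prio_entry--> entry C
+ *	msg_prio_highest ---> [-5]
+ *	msg_list (FIFO):     A -> B -> C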
+ */
+struct kdbus_queue {
+	struct list_head msg_list;
+	struct rb_root msg_prio_queue;
+	struct rb_node *msg_prio_highest;
+};
+
+/**
+ * struct kdbus_queue_entry - messages waiting to be read
+ * @entry:		Entry in the connection's list
+ * @prio_node:		Entry in the priority queue tree
+ * @prio_entry:		Queue tree node entry in the list of one priority
+ * @priority:		Message priority
+ * @dst_name_id:	The sequence number of the name this message is
+ *			addressed to, 0 for messages sent to an ID
+ * @conn:		Connection this entry is queued on
+ * @gaps:		Gaps object to fill message gaps at RECV time
+ * @user:		User used for accounting
+ * @slice:		Slice in the receiver's pool for the message
+ * @reply:		The reply block if a reply to this message is expected
+ */
+struct kdbus_queue_entry {
+	struct list_head entry;
+	struct rb_node prio_node;
+	struct list_head prio_entry;
+
+	s64 priority;
+	u64 dst_name_id;
+
+	struct kdbus_conn *conn;
+	struct kdbus_gaps *gaps;
+	struct kdbus_user *user;
+	struct kdbus_pool_slice *slice;
+	struct kdbus_reply *reply;
+};
+
+void kdbus_queue_init(struct kdbus_queue *queue);
+struct kdbus_queue_entry *kdbus_queue_peek(struct kdbus_queue *queue,
+					   s64 priority, bool use_priority);
+
+struct kdbus_queue_entry *kdbus_queue_entry_new(struct kdbus_conn *src,
+						struct kdbus_conn *dst,
+						struct kdbus_staging *s);
+void kdbus_queue_entry_free(struct kdbus_queue_entry *entry);
+int kdbus_queue_entry_install(struct kdbus_queue_entry *entry,
+			      u64 *return_flags, bool install_fds);
+void kdbus_queue_entry_enqueue(struct kdbus_queue_entry *entry,
+			       struct kdbus_reply *reply);
+int kdbus_queue_entry_move(struct kdbus_queue_entry *entry,
+			   struct kdbus_conn *dst);
+
+#endif /* __KDBUS_QUEUE_H */
diff -Nur linux-4.2/ipc/kdbus/reply.c linux-4.2-pck/ipc/kdbus/reply.c
--- linux-4.2/ipc/kdbus/reply.c	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/reply.c	2015-09-08 00:48:22.238363406 -0300
@@ -0,0 +1,252 @@
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+
+#include "bus.h"
+#include "connection.h"
+#include "endpoint.h"
+#include "message.h"
+#include "metadata.h"
+#include "names.h"
+#include "domain.h"
+#include "item.h"
+#include "notify.h"
+#include "policy.h"
+#include "reply.h"
+#include "util.h"
+
+/**
+ * kdbus_reply_new() - Allocate and set up a new kdbus_reply object
+ * @reply_src:		The connection a reply is expected from
+ * @reply_dst:		The connection this reply object belongs to
+ * @msg:		Message associated with the reply
+ * @name_entry:		Name entry used to send the message
+ * @sync:		Whether or not to make this reply synchronous
+ *
+ * Allocate and fill a new kdbus_reply object.
+ *
+ * Return: New kdbus_reply object on success, ERR_PTR on error.
+ */
+struct kdbus_reply *kdbus_reply_new(struct kdbus_conn *reply_src,
+				    struct kdbus_conn *reply_dst,
+				    const struct kdbus_msg *msg,
+				    struct kdbus_name_entry *name_entry,
+				    bool sync)
+{
+	struct kdbus_reply *r;
+	int ret;
+
+	if (atomic_inc_return(&reply_dst->request_count) >
+	    KDBUS_CONN_MAX_REQUESTS_PENDING) {
+		ret = -EMLINK;
+		goto exit_dec_request_count;
+	}
+
+	r = kzalloc(sizeof(*r), GFP_KERNEL);
+	if (!r) {
+		ret = -ENOMEM;
+		goto exit_dec_request_count;
+	}
+
+	kref_init(&r->kref);
+	INIT_LIST_HEAD(&r->entry);
+	r->reply_src = kdbus_conn_ref(reply_src);
+	r->reply_dst = kdbus_conn_ref(reply_dst);
+	r->cookie = msg->cookie;
+	r->name_id = name_entry ?
name_entry->name_id : 0; + r->deadline_ns = msg->timeout_ns; + + if (sync) { + r->sync = true; + r->waiting = true; + } + + return r; + +exit_dec_request_count: + atomic_dec(&reply_dst->request_count); + return ERR_PTR(ret); +} + +static void __kdbus_reply_free(struct kref *kref) +{ + struct kdbus_reply *reply = + container_of(kref, struct kdbus_reply, kref); + + atomic_dec(&reply->reply_dst->request_count); + kdbus_conn_unref(reply->reply_src); + kdbus_conn_unref(reply->reply_dst); + kfree(reply); +} + +/** + * kdbus_reply_ref() - Increase reference on kdbus_reply + * @r: The reply, may be %NULL + * + * Return: The reply object with an extra reference + */ +struct kdbus_reply *kdbus_reply_ref(struct kdbus_reply *r) +{ + if (r) + kref_get(&r->kref); + return r; +} + +/** + * kdbus_reply_unref() - Decrease reference on kdbus_reply + * @r: The reply, may be %NULL + * + * Return: NULL + */ +struct kdbus_reply *kdbus_reply_unref(struct kdbus_reply *r) +{ + if (r) + kref_put(&r->kref, __kdbus_reply_free); + return NULL; +} + +/** + * kdbus_reply_link() - Link reply object into target connection + * @r: Reply to link + */ +void kdbus_reply_link(struct kdbus_reply *r) +{ + if (WARN_ON(!list_empty(&r->entry))) + return; + + list_add(&r->entry, &r->reply_dst->reply_list); + kdbus_reply_ref(r); +} + +/** + * kdbus_reply_unlink() - Unlink reply object from target connection + * @r: Reply to unlink + */ +void kdbus_reply_unlink(struct kdbus_reply *r) +{ + if (!list_empty(&r->entry)) { + list_del_init(&r->entry); + kdbus_reply_unref(r); + } +} + +/** + * kdbus_sync_reply_wakeup() - Wake a synchronously blocking reply + * @reply: The reply object + * @err: Error code to set on the remote side + * + * Wake up remote peer (method origin) with the appropriate synchronous reply + * code. + */ +void kdbus_sync_reply_wakeup(struct kdbus_reply *reply, int err) +{ + if (WARN_ON(!reply->sync)) + return; + + reply->waiting = false; + reply->err = err; + wake_up_interruptible(&reply->reply_dst->wait); +} + +/** + * kdbus_reply_find() - Find the corresponding reply object + * @replying: The replying connection or NULL + * @reply_dst: The connection the reply will be sent to + * (method origin) + * @cookie: The cookie of the requesting message + * + * Lookup a reply object that should be sent as a reply by + * @replying to @reply_dst with the given cookie. + * + * Callers must take the @reply_dst lock. + * + * Return: the corresponding reply object or NULL if not found + */ +struct kdbus_reply *kdbus_reply_find(struct kdbus_conn *replying, + struct kdbus_conn *reply_dst, + u64 cookie) +{ + struct kdbus_reply *r; + + list_for_each_entry(r, &reply_dst->reply_list, entry) { + if (r->cookie == cookie && + (!replying || r->reply_src == replying)) + return r; + } + + return NULL; +} + +/** + * kdbus_reply_list_scan_work() - Worker callback to scan the replies of a + * connection for exceeded timeouts + * @work: Work struct of the connection to scan + * + * Walk the list of replies stored with a connection and look for entries + * that have exceeded their timeout. If such an entry is found, a timeout + * notification is sent to the waiting peer, and the reply is removed from + * the list. + * + * The work is rescheduled to the nearest timeout found during the list + * iteration. 
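+ *
+ * A minimal sketch of the rearm arithmetic (assuming a single tracked
+ * reply whose deadline lies 3 ms in the future at scan time):
+ *
+ *	now = ktime_get_ns();			(deadline_ns == now + 3000000)
+ *	schedule_delayed_work(&conn->work,
+ *			      nsecs_to_jiffies(reply->deadline_ns - now));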
+ */ +void kdbus_reply_list_scan_work(struct work_struct *work) +{ + struct kdbus_conn *conn = + container_of(work, struct kdbus_conn, work.work); + struct kdbus_reply *reply, *reply_tmp; + u64 deadline = ~0ULL; + u64 now; + + now = ktime_get_ns(); + + mutex_lock(&conn->lock); + if (!kdbus_conn_active(conn)) { + mutex_unlock(&conn->lock); + return; + } + + list_for_each_entry_safe(reply, reply_tmp, &conn->reply_list, entry) { + /* + * If the reply block is waiting for synchronous I/O, + * the timeout is handled by wait_event_*_timeout(), + * so we don't have to care for it here. + */ + if (reply->sync && !reply->interrupted) + continue; + + WARN_ON(reply->reply_dst != conn); + + if (reply->deadline_ns > now) { + /* remember next timeout */ + if (deadline > reply->deadline_ns) + deadline = reply->deadline_ns; + + continue; + } + + /* + * A zero deadline means the connection died, was + * cleaned up already and the notification was sent. + * Don't send notifications for reply trackers that were + * left in an interrupted syscall state. + */ + if (reply->deadline_ns != 0 && !reply->interrupted) + kdbus_notify_reply_timeout(conn->ep->bus, conn->id, + reply->cookie); + + kdbus_reply_unlink(reply); + } + + /* rearm delayed work with next timeout */ + if (deadline != ~0ULL) + schedule_delayed_work(&conn->work, + nsecs_to_jiffies(deadline - now)); + + mutex_unlock(&conn->lock); + + kdbus_notify_flush(conn->ep->bus); +} diff -Nur linux-4.2/ipc/kdbus/reply.h linux-4.2-pck/ipc/kdbus/reply.h --- linux-4.2/ipc/kdbus/reply.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/reply.h 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */
+
+#ifndef __KDBUS_REPLY_H
+#define __KDBUS_REPLY_H
+
+/**
+ * struct kdbus_reply - an entry of kdbus_conn's list of replies
+ * @kref:		Ref-count of this object
+ * @entry:		The entry of the connection's reply_list
+ * @reply_src:		The connection the reply will be sent from
+ * @reply_dst:		The connection the reply will be sent to
+ * @queue_entry:	The queue entry item that is prepared by the replying
+ *			connection
+ * @deadline_ns:	The deadline of the reply, in nanoseconds
+ * @cookie:		The cookie of the requesting message
+ * @name_id:		ID of the well-known name the original msg was sent to
+ * @sync:		The reply block is waiting for synchronous I/O
+ * @waiting:		The condition to synchronously wait for
+ * @interrupted:	The sync reply was left in an interrupted state
+ * @err:		The error code for the synchronous reply
+ */
+struct kdbus_reply {
+	struct kref kref;
+	struct list_head entry;
+	struct kdbus_conn *reply_src;
+	struct kdbus_conn *reply_dst;
+	struct kdbus_queue_entry *queue_entry;
+	u64 deadline_ns;
+	u64 cookie;
+	u64 name_id;
+	bool sync:1;
+	bool waiting:1;
+	bool interrupted:1;
+	int err;
+};
+
+struct kdbus_reply *kdbus_reply_new(struct kdbus_conn *reply_src,
+				    struct kdbus_conn *reply_dst,
+				    const struct kdbus_msg *msg,
+				    struct kdbus_name_entry *name_entry,
+				    bool sync);
+
+struct kdbus_reply *kdbus_reply_ref(struct kdbus_reply *r);
+struct kdbus_reply *kdbus_reply_unref(struct kdbus_reply *r);
+
+void kdbus_reply_link(struct kdbus_reply *r);
+void kdbus_reply_unlink(struct kdbus_reply *r);
+
+struct kdbus_reply *kdbus_reply_find(struct kdbus_conn *replying,
+				     struct kdbus_conn *reply_dst,
+				     u64 cookie);
+
+void kdbus_sync_reply_wakeup(struct kdbus_reply *reply, int err);
+void kdbus_reply_list_scan_work(struct work_struct *work);
+
+#endif /* __KDBUS_REPLY_H */
diff -Nur linux-4.2/ipc/kdbus/util.c linux-4.2-pck/ipc/kdbus/util.c
--- linux-4.2/ipc/kdbus/util.c	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/util.c	2015-09-08 00:48:22.238363406 -0300
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/capability.h>
+#include <linux/cred.h>
+#include <linux/ctype.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <linux/user_namespace.h>
+
+#include "limits.h"
+#include "util.h"
+
+/**
+ * kdbus_copy_from_user() - copy aligned data from user-space
+ * @dest:	target buffer in kernel memory
+ * @user_ptr:	user-provided source buffer
+ * @size:	memory size to copy from user
+ *
+ * This copies @size bytes from @user_ptr into the kernel, just like
+ * copy_from_user() does. But we enforce an 8-byte alignment and reject any
+ * unaligned user-space pointers.
+ *
+ * Return: 0 on success, negative error code on failure.
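+ *
+ * A minimal usage sketch (assuming an ioctl handler that received a
+ * user pointer @buf to a struct kdbus_cmd):
+ *
+ *	struct kdbus_cmd cmd;
+ *	int ret;
+ *
+ *	ret = kdbus_copy_from_user(&cmd, buf, sizeof(cmd));
+ *	if (ret < 0)
+ *		return ret;	(-EFAULT for faults or unaligned @buf)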
+ */
+int kdbus_copy_from_user(void *dest, void __user *user_ptr, size_t size)
+{
+	if (!KDBUS_IS_ALIGNED8((uintptr_t)user_ptr))
+		return -EFAULT;
+
+	if (copy_from_user(dest, user_ptr, size))
+		return -EFAULT;
+
+	return 0;
+}
+
+/**
+ * kdbus_verify_uid_prefix() - verify UID prefix of a user-supplied name
+ * @name:	user-supplied name to verify
+ * @user_ns:	user-namespace to act in
+ * @kuid:	Kernel internal uid of user
+ *
+ * This verifies that the user-supplied name @name has the user's UID as a
+ * prefix. This is the default name-spacing policy we enforce on user-supplied
+ * names for public kdbus entities like buses and endpoints.
+ *
+ * The user must supply names prefixed with "<UID>-", whereas the UID is
+ * interpreted in the user-namespace of the domain. If the user fails to supply
+ * such a prefixed name, we reject it.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int kdbus_verify_uid_prefix(const char *name, struct user_namespace *user_ns,
+			    kuid_t kuid)
+{
+	uid_t uid;
+	char prefix[16];
+
+	/*
+	 * The kuid must have a mapping into the userns of the domain
+	 * otherwise do not allow creation of buses nor endpoints.
+	 */
+	uid = from_kuid(user_ns, kuid);
+	if (uid == (uid_t) -1)
+		return -EINVAL;
+
+	snprintf(prefix, sizeof(prefix), "%u-", uid);
+	if (strncmp(name, prefix, strlen(prefix)) != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * kdbus_sanitize_attach_flags() - Sanitize attach flags from user-space
+ * @flags:		Attach flags provided by userspace
+ * @attach_flags:	A pointer where to store the valid attach flags
+ *
+ * Convert attach-flags provided by user-space into a valid mask. If the mask
+ * is invalid, an error is returned. The sanitized attach flags are stored in
+ * the output parameter.
+ *
+ * Return: 0 on success, negative error on failure.
+ */
+int kdbus_sanitize_attach_flags(u64 flags, u64 *attach_flags)
+{
+	/* 'any' degrades to 'all' for compatibility */
+	if (flags == _KDBUS_ATTACH_ANY)
+		flags = _KDBUS_ATTACH_ALL;
+
+	/* reject unknown attach flags */
+	if (flags & ~_KDBUS_ATTACH_ALL)
+		return -EINVAL;
+
+	*attach_flags = flags;
+	return 0;
+}
+
+/**
+ * kdbus_kvec_set - helper utility to assemble kvec arrays
+ * @kvec:	kvec entry to use
+ * @src:	Source address to set in @kvec
+ * @len:	Number of bytes in @src
+ * @total_len:	Pointer to total length variable
+ *
+ * Set @src and @len in @kvec, and increase @total_len by @len.
+ */
+void kdbus_kvec_set(struct kvec *kvec, void *src, size_t len, u64 *total_len)
+{
+	kvec->iov_base = src;
+	kvec->iov_len = len;
+	*total_len += len;
+}
+
+static const char * const zeros = "\0\0\0\0\0\0\0";
+
+/**
+ * kdbus_kvec_pad - conditionally write a padding kvec
+ * @kvec:	kvec entry to use
+ * @len:	Total length used for kvec array
+ *
+ * Check if the current total byte length of the array in @len is aligned to
+ * 8 bytes. If it isn't, fill @kvec with padding information and increase @len
+ * by the number of bytes stored in @kvec.
+ *
+ * Return: the number of added padding bytes.
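+ *
+ * A minimal sketch of assembling an 8-byte aligned stream (assuming a
+ * 12-byte payload in @data; the helper updates @len itself, so no
+ * further addition is needed):
+ *
+ *	struct kvec kvec[2];
+ *	u64 len = 0;
+ *
+ *	kdbus_kvec_set(&kvec[0], data, 12, &len);	(len == 12)
+ *	kdbus_kvec_pad(&kvec[1], &len);			(len == 16, kvec[1]
+ *							 carries 4 zero bytes)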
+ */
+size_t kdbus_kvec_pad(struct kvec *kvec, u64 *len)
+{
+	size_t pad = KDBUS_ALIGN8(*len) - *len;
+
+	if (!pad)
+		return 0;
+
+	kvec->iov_base = (void *)zeros;
+	kvec->iov_len = pad;
+
+	*len += pad;
+
+	return pad;
+}
diff -Nur linux-4.2/ipc/kdbus/util.h linux-4.2-pck/ipc/kdbus/util.h
--- linux-4.2/ipc/kdbus/util.h	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/util.h	2015-09-08 00:48:22.238363406 -0300
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#ifndef __KDBUS_UTIL_H
+#define __KDBUS_UTIL_H
+
+#include <linux/dcache.h>
+#include <linux/ioctl.h>
+
+#include <uapi/linux/kdbus.h>
+
+/* all exported addresses are 64 bit */
+#define KDBUS_PTR(addr) ((void __user *)(uintptr_t)(addr))
+
+/* all exported sizes are 64 bit and data aligned to 64 bit */
+#define KDBUS_ALIGN8(s) ALIGN((s), 8)
+#define KDBUS_IS_ALIGNED8(s) (IS_ALIGNED(s, 8))
+
+/**
+ * kdbus_member_set_user - write a structure member to user memory
+ * @_s:		Variable to copy from
+ * @_b:		Buffer to write to
+ * @_t:		Structure type
+ * @_m:		Member name in the passed structure
+ *
+ * Return: the result of copy_to_user()
+ */
+#define kdbus_member_set_user(_s, _b, _t, _m)				\
+({									\
+	u64 __user *_sz =						\
+		(void __user *)((u8 __user *)(_b) + offsetof(_t, _m));	\
+	copy_to_user(_sz, _s, FIELD_SIZEOF(_t, _m));			\
+})
+
+/**
+ * kdbus_strhash - calculate a hash
+ * @str:	String
+ *
+ * Return: hash value
+ */
+static inline unsigned int kdbus_strhash(const char *str)
+{
+	unsigned long hash = init_name_hash();
+
+	while (*str)
+		hash = partial_name_hash(*str++, hash);
+
+	return end_name_hash(hash);
+}
+
+int kdbus_verify_uid_prefix(const char *name, struct user_namespace *user_ns,
+			    kuid_t kuid);
+int kdbus_sanitize_attach_flags(u64 flags, u64 *attach_flags);
+
+int kdbus_copy_from_user(void *dest, void __user *user_ptr, size_t size);
+
+struct kvec;
+
+void kdbus_kvec_set(struct kvec *kvec, void *src, size_t len, u64 *total_len);
+size_t kdbus_kvec_pad(struct kvec *kvec, u64 *len);
+
+#endif
diff -Nur linux-4.2/kernel/debug/kdb/kdb_io.c linux-4.2-pck/kernel/debug/kdb/kdb_io.c
--- linux-4.2/kernel/debug/kdb/kdb_io.c	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/kernel/debug/kdb/kdb_io.c	2015-09-08 00:48:22.238363406 -0300
@@ -710,7 +710,7 @@
 		}
 	}
 	while (c) {
-		c->write(c, cp, retlen - (cp - kdb_buffer));
+		c->write(c, cp, retlen - (cp - kdb_buffer), 7); /* 7 == KERN_DEBUG */
 		touch_nmi_watchdog();
 		c = c->next;
 	}
@@ -774,7 +774,7 @@
 		}
 	}
 	while (c) {
-		c->write(c, moreprompt, strlen(moreprompt));
+		c->write(c, moreprompt, strlen(moreprompt), 7); /* 7 == KERN_DEBUG */
 		touch_nmi_watchdog();
 		c = c->next;
 	}
diff -Nur linux-4.2/kernel/fork.c linux-4.2-pck/kernel/fork.c
--- linux-4.2/kernel/fork.c	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/kernel/fork.c	2015-09-08 00:51:03.460668141 -0300
@@ -443,7 +443,7 @@
 			goto fail_nomem;
 		charge = len;
 	}
-	tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (!tmp)
 		goto fail_nomem;
 	*tmp = *mpnt;
@@ -494,7 +494,7 @@
 	__vma_link_rb(mm, tmp, rb_link, rb_parent);
 	rb_link =
&tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; - + uksm_vma_add_new(tmp); mm->map_count++; retval = copy_page_range(mm, oldmm, mpnt); diff -Nur linux-4.2/kernel/printk/printk.c linux-4.2-pck/kernel/printk/printk.c --- linux-4.2/kernel/printk/printk.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/printk/printk.c 2015-09-08 00:48:22.238363406 -0300 @@ -1450,9 +1450,9 @@ !(con->flags & CON_ANYTIME)) continue; if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); + con->write(con, ext_text, ext_len, level); else - con->write(con, text, len); + con->write(con, text, len, level); } } @@ -1973,7 +1973,7 @@ n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); - early_console->write(early_console, buf, n); + early_console->write(early_console, buf, n, 0); } #endif diff -Nur linux-4.2/kernel/sched/fair.c linux-4.2-pck/kernel/sched/fair.c --- linux-4.2/kernel/sched/fair.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/sched/fair.c 2015-09-08 00:48:22.241696581 -0300 @@ -47,8 +47,13 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ +#ifdef CONFIG_PCK_INTERACTIVE +unsigned int sysctl_sched_latency = 3000000ULL; +unsigned int normalized_sysctl_sched_latency = 3000000ULL; +#else unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +#endif /* * The initial- and re-scaling of tunables is configurable @@ -66,13 +71,22 @@ * Minimal preemption granularity for CPU-bound tasks: * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_PCK_INTERACTIVE +unsigned int sysctl_sched_min_granularity = 300000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; +#else unsigned int sysctl_sched_min_granularity = 750000ULL; unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +#endif /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ +#ifdef CONFIG_PCK_INTERACTIVE +static unsigned int sched_nr_latency = 10; +#else static unsigned int sched_nr_latency = 8; +#endif /* * After fork, child runs first. If set to 0 (default) then @@ -88,10 +102,17 @@ * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
*/ +#ifdef CONFIG_PCK_INTERACTIVE +unsigned int sysctl_sched_wakeup_granularity = 500000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; + +const_debug unsigned int sysctl_sched_migration_cost = 250000UL; +#else unsigned int sysctl_sched_wakeup_granularity = 1000000UL; unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#endif /* * The exponential sliding window over which load is averaged for shares @@ -111,8 +132,12 @@ * * default: 5 msec, units: microseconds */ +#ifdef CONFIG_PCK_INTERACTIVE +unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) { diff -Nur linux-4.2/kernel/workqueue.c linux-4.2-pck/kernel/workqueue.c --- linux-4.2/kernel/workqueue.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/workqueue.c 2015-09-08 00:48:22.241696581 -0300 @@ -2614,7 +2614,7 @@ out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL_GPL(flush_workqueue); +EXPORT_SYMBOL(flush_workqueue); /** * drain_workqueue - drain a workqueue diff -Nur linux-4.2/lib/Makefile linux-4.2-pck/lib/Makefile --- linux-4.2/lib/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/lib/Makefile 2015-09-08 00:51:03.460668141 -0300 @@ -8,7 +8,7 @@ endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o timerqueue.o\ + rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o argv_split.o \ proportions.o flex_proportions.o ratelimit.o show_mem.o \ diff -Nur linux-4.2/lib/sradix-tree.c linux-4.2-pck/lib/sradix-tree.c --- linux-4.2/lib/sradix-tree.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/lib/sradix-tree.c 2015-09-08 00:51:03.460668141 -0300 @@ -0,0 +1,476 @@ +#include +#include +#include +#include +#include +#include +#include + +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) +{ + return node->fulls == root->stores_size || + (node->height == 1 && node->count == root->stores_size); +} + +/* + * Extend a sradix tree so it can store key @index. + */ +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) +{ + struct sradix_tree_node *node; + unsigned int height; + + if (unlikely(root->rnode == NULL)) { + if (!(node = root->alloc())) + return -ENOMEM; + + node->height = 1; + root->rnode = node; + root->height = 1; + } + + /* Figure out what the height should be. */ + height = root->height; + index >>= root->shift * height; + + while (index) { + index >>= root->shift; + height++; + } + + while (height > root->height) { + unsigned int newheight; + if (!(node = root->alloc())) + return -ENOMEM; + + /* Increase the height. */ + node->stores[0] = root->rnode; + root->rnode->parent = node; + if (root->extend) + root->extend(node, root->rnode); + + newheight = root->height + 1; + node->height = newheight; + node->count = 1; + if (sradix_node_full(root, root->rnode)) + node->fulls = 1; + + root->rnode = node; + root->height = newheight; + } + + return 0; +} + +/* + * Search the next item from the current node, that is not NULL + * and can satify root->iter(). 
+ */
+void *sradix_tree_next(struct sradix_tree_root *root,
+		       struct sradix_tree_node *node, unsigned long index,
+		       int (*iter)(void *item, unsigned long height))
+{
+	unsigned long offset;
+	void *item;
+
+	if (unlikely(node == NULL)) {
+		node = root->rnode;
+		for (offset = 0; offset < root->stores_size; offset++) {
+			item = node->stores[offset];
+			if (item && (!iter || iter(item, node->height)))
+				break;
+		}
+
+		if (unlikely(offset >= root->stores_size))
+			return NULL;
+
+		if (node->height == 1)
+			return item;
+		else
+			goto go_down;
+	}
+
+	while (node) {
+		offset = (index & root->mask) + 1;
+		for (; offset < root->stores_size; offset++) {
+			item = node->stores[offset];
+			if (item && (!iter || iter(item, node->height)))
+				break;
+		}
+
+		if (offset < root->stores_size)
+			break;
+
+		node = node->parent;
+		index >>= root->shift;
+	}
+
+	if (!node)
+		return NULL;
+
+	while (node->height > 1) {
+go_down:
+		node = item;
+		for (offset = 0; offset < root->stores_size; offset++) {
+			item = node->stores[offset];
+			if (item && (!iter || iter(item, node->height)))
+				break;
+		}
+
+		if (unlikely(offset >= root->stores_size))
+			return NULL;
+	}
+
+	BUG_ON(offset > root->stores_size);
+
+	return item;
+}
+
+/*
+ * Blindly insert the item into the tree. Typically, we reuse the
+ * first empty store item.
+ */
+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
+{
+	unsigned long index;
+	unsigned int height;
+	struct sradix_tree_node *node, *tmp = NULL;
+	int offset, offset_saved;
+	void **store = NULL;
+	int error, i, j, shift;
+
+go_on:
+	index = root->min;
+
+	if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
+		node = root->enter_node;
+		BUG_ON((index >> (root->shift * root->height)));
+	} else {
+		node = root->rnode;
+		if (node == NULL || (index >> (root->shift * root->height))
+		    || sradix_node_full(root, node)) {
+			error = sradix_tree_extend(root, index);
+			if (error)
+				return error;
+
+			node = root->rnode;
+		}
+	}
+
+	height = node->height;
+	shift = (height - 1) * root->shift;
+	offset = (index >> shift) & root->mask;
+	while (shift > 0) {
+		offset_saved = offset;
+		for (; offset < root->stores_size; offset++) {
+			store = &node->stores[offset];
+			tmp = *store;
+
+			if (!tmp || !sradix_node_full(root, tmp))
+				break;
+		}
+		BUG_ON(offset >= root->stores_size);
+
+		if (offset != offset_saved) {
+			index += (offset - offset_saved) << shift;
+			index &= ~((1UL << shift) - 1);
+		}
+
+		if (!tmp) {
+			if (!(tmp = root->alloc()))
+				return -ENOMEM;
+
+			tmp->height = shift / root->shift;
+			*store = tmp;
+			tmp->parent = node;
+			node->count++;
+//			if (root->extend)
+//				root->extend(node, tmp);
+		}
+
+		node = tmp;
+		shift -= root->shift;
+		offset = (index >> shift) & root->mask;
+	}
+
+	BUG_ON(node->height != 1);
+
+	store = &node->stores[offset];
+	for (i = 0, j = 0;
+	     j < root->stores_size - node->count &&
+	     i < root->stores_size - offset && j < num; i++) {
+		if (!store[i]) {
+			store[i] = item[j];
+			if (root->assign)
+				root->assign(node, index + i, item[j]);
+			j++;
+		}
+	}
+
+	node->count += j;
+	root->num += j;
+	num -= j;
+
+	while (sradix_node_full(root, node)) {
+		node = node->parent;
+		if (!node)
+			break;
+
+		node->fulls++;
+	}
+
+	if (unlikely(!node)) {
+		/* All nodes are full */
+		root->min = 1 << (root->height * root->shift);
+		root->enter_node = NULL;
+	} else {
+		root->min = index + i - 1;
+		root->min |= (1UL << (node->height - 1)) - 1;
+		root->min++;
+		root->enter_node = node;
+	}
+
+	if (num) {
+		item += j;
+		goto go_on;
+	}
+
+	return 0;
+}
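For reference, the shift/mask walk used by sradix_tree_enter() above (and by the lookup paths below) consumes root->shift bits of the key per level, most-significant chunk first. A standalone illustration of that decomposition, assuming shift = 8 as UKSM's slot tree uses later in this patch (the program itself is ours, not the library's):

    #include <stdio.h>

    int main(void)
    {
            unsigned long index = 0x12345;
            unsigned int shift = 8, height = 3;
            unsigned long mask = (1UL << shift) - 1;
            int s;

            /* a height-3 walk splits 0x12345 into offsets 0x01, 0x23, 0x45 */
            for (s = (height - 1) * shift; s >= 0; s -= shift)
                    printf("shift %2d -> offset 0x%02lx\n", s, (index >> s) & mask);
            return 0;
    }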
+/**
+ * sradix_tree_shrink - shrink the height of a sradix tree to the minimum
+ * @root: sradix tree root
+ *
+ */
+static inline void sradix_tree_shrink(struct sradix_tree_root *root)
+{
+	/* try to shrink tree height */
+	while (root->height > 1) {
+		struct sradix_tree_node *to_free = root->rnode;
+
+		/*
+		 * If the candidate node has more than one child, or its child
+		 * is not at the leftmost store, we cannot shrink.
+		 */
+		if (to_free->count != 1 || !to_free->stores[0])
+			break;
+
+		root->rnode = to_free->stores[0];
+		root->rnode->parent = NULL;
+		root->height--;
+		if (unlikely(root->enter_node == to_free)) {
+			root->enter_node = NULL;
+		}
+		root->free(to_free);
+	}
+}
+
+/*
+ * Delete the item at the known leaf node and index.
+ */
+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
+				  struct sradix_tree_node *node, unsigned long index)
+{
+	unsigned int offset;
+	struct sradix_tree_node *start, *end;
+
+	BUG_ON(node->height != 1);
+
+	start = node;
+	while (node && !(--node->count))
+		node = node->parent;
+
+	end = node;
+	if (!node) {
+		root->rnode = NULL;
+		root->height = 0;
+		root->min = 0;
+		root->num = 0;
+		root->enter_node = NULL;
+	} else {
+		offset = (index >> (root->shift * (node->height - 1))) & root->mask;
+		if (root->rm)
+			root->rm(node, offset);
+		node->stores[offset] = NULL;
+		root->num--;
+		if (root->min > index) {
+			root->min = index;
+			root->enter_node = node;
+		}
+	}
+
+	if (start != end) {
+		do {
+			node = start;
+			start = start->parent;
+			if (unlikely(root->enter_node == node))
+				root->enter_node = end;
+			root->free(node);
+		} while (start != end);
+
+		/*
+		 * Note that shrink may free "end", so enter_node still needs
+		 * to be checked inside.
+		 */
+		sradix_tree_shrink(root);
+	} else if (node->count == root->stores_size - 1) {
+		/* It WAS a full leaf node. Update the ancestors. */
+		node = node->parent;
+		while (node) {
+			node->fulls--;
+			if (node->fulls != root->stores_size - 1)
+				break;
+
+			node = node->parent;
+		}
+	}
+}
+
+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
+{
+	unsigned int height, offset;
+	struct sradix_tree_node *node;
+	int shift;
+
+	node = root->rnode;
+	if (node == NULL || (index >> (root->shift * root->height)))
+		return NULL;
+
+	height = root->height;
+	shift = (height - 1) * root->shift;
+
+	do {
+		offset = (index >> shift) & root->mask;
+		node = node->stores[offset];
+		if (!node)
+			return NULL;
+
+		shift -= root->shift;
+	} while (shift >= 0);
+
+	return node;
+}
+
+/*
+ * Return the item if it exists, otherwise create it in place
+ * and return the created item.
+ */
+void *sradix_tree_lookup_create(struct sradix_tree_root *root,
+			unsigned long index, void *(*item_alloc)(void))
+{
+	unsigned int height, offset;
+	struct sradix_tree_node *node, *tmp;
+	void *item;
+	int shift, error;
+
+	if (root->rnode == NULL || (index >> (root->shift * root->height))) {
+		if (item_alloc) {
+			error = sradix_tree_extend(root, index);
+			if (error)
+				return NULL;
+		} else {
+			return NULL;
+		}
+	}
+
+	node = root->rnode;
+	height = root->height;
+	shift = (height - 1) * root->shift;
+
+	do {
+		offset = (index >> shift) & root->mask;
+		if (!node->stores[offset]) {
+			if (!(tmp = root->alloc()))
+				return NULL;
+
+			tmp->height = shift / root->shift;
+			node->stores[offset] = tmp;
+			tmp->parent = node;
+			node->count++;
+			node = tmp;
+		} else {
+			node = node->stores[offset];
+		}
+
+		shift -= root->shift;
+	} while (shift > 0);
+
+	BUG_ON(node->height != 1);
+	offset = index & root->mask;
+	if (node->stores[offset]) {
+		return node->stores[offset];
+	} else if (item_alloc) {
+		if (!(item = item_alloc()))
+			return NULL;
+
+		node->stores[offset] = item;
+
+		/*
+		 * NOTE: we do NOT call root->assign here, since this item was
+		 * newly created by us and carries no meaning yet. The caller
+		 * can do that afterwards if necessary.
+		 */
+
+		node->count++;
+		root->num++;
+
+		while (sradix_node_full(root, node)) {
+			node = node->parent;
+			if (!node)
+				break;
+
+			node->fulls++;
+		}
+
+		if (unlikely(!node)) {
+			/* All nodes are full */
+			root->min = 1 << (root->height * root->shift);
+		} else {
+			if (root->min == index) {
+				root->min |= (1UL << (node->height - 1)) - 1;
+				root->min++;
+				root->enter_node = node;
+			}
+		}
+
+		return item;
+	} else {
+		return NULL;
+	}
+
+}
+
+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
+{
+	unsigned int height, offset;
+	struct sradix_tree_node *node;
+	int shift;
+
+	node = root->rnode;
+	if (node == NULL || (index >> (root->shift * root->height)))
+		return -ENOENT;
+
+	height = root->height;
+	shift = (height - 1) * root->shift;
+
+	do {
+		offset = (index >> shift) & root->mask;
+		node = node->stores[offset];
+		if (!node)
+			return -ENOENT;
+
+		shift -= root->shift;
+	} while (shift > 0);
+
+	offset = index & root->mask;
+	if (!node->stores[offset])
+		return -ENOENT;
+
+	sradix_tree_delete_from_leaf(root, node, index);
+
+	return 0;
+}
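Taken together, this API is driven through a root whose callbacks supply node storage; mm/uksm.c below wires it up for its slot tree (see slot_tree_init_root()). A minimal usage sketch following the same embedding pattern; my_node, my_alloc, my_free and my_example are hypothetical names, and error handling is elided:

    #define MY_SHIFT 4

    struct my_node {
            struct sradix_tree_node snode;
            void *stores[1UL << MY_SHIFT];  /* backs snode's store slots */
    };

    static struct sradix_tree_node *my_alloc(void)
    {
            struct my_node *n = kzalloc(sizeof(*n), GFP_KERNEL);
            return n ? &n->snode : NULL;
    }

    static void my_free(struct sradix_tree_node *node)
    {
            kfree(container_of(node, struct my_node, snode));
    }

    static void my_example(void)
    {
            struct sradix_tree_root root;
            void *item = &root;     /* any non-NULL pointer */

            init_sradix_tree_root(&root, MY_SHIFT);
            root.alloc = my_alloc;
            root.free = my_free;

            if (!sradix_tree_enter(&root, &item, 1)) {      /* lands at index 0 */
                    WARN_ON(sradix_tree_lookup(&root, 0) != item);
                    sradix_tree_delete(&root, 0);
            }
    }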
diff -Nur linux-4.2/mm/Kconfig linux-4.2-pck/mm/Kconfig
--- linux-4.2/mm/Kconfig	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/mm/Kconfig	2015-09-08 00:51:03.460668141 -0300
@@ -340,6 +340,32 @@
 	  See Documentation/vm/ksm.txt for more information: KSM is inactive
 	  until a program has madvised that an area is MADV_MERGEABLE, and
 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
+choice
+	prompt "Choose UKSM/KSM strategy"
+	default UKSM
+	depends on KSM
+	help
+	  This option allows you to select a UKSM/KSM strategy.
+
+config UKSM
+	bool "Ultra-KSM for page merging"
+	depends on KSM
+	help
+	  UKSM is inspired by the Linux kernel's KSM (Kernel Samepage
+	  Merging), but with a fundamentally rewritten core algorithm. With
+	  an advanced algorithm, UKSM can now transparently scan all
+	  anonymously mapped user-space applications with significantly
+	  improved scan speed and CPU efficiency. Since KVM is friendly to
+	  KSM, KVM can also benefit from UKSM. UKSM now has its first stable
+	  release and its first real-world enterprise user. For more
+	  information, please go to its project page:
+	  (www.kerneldedup.org)
+
+config KSM_LEGACY
+	bool "Legacy KSM implementation"
+	depends on KSM
+	help
+	  The legacy KSM implementation from Red Hat.
+endchoice
 
 config DEFAULT_MMAP_MIN_ADDR
 	int "Low address space to protect from user allocation"
diff -Nur linux-4.2/mm/Makefile linux-4.2-pck/mm/Makefile
--- linux-4.2/mm/Makefile	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/mm/Makefile	2015-09-08 00:51:03.464001315 -0300
@@ -47,7 +47,8 @@
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
-obj-$(CONFIG_KSM) += ksm.o
+obj-$(CONFIG_KSM_LEGACY) += ksm.o
+obj-$(CONFIG_UKSM) += uksm.o
 obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
diff -Nur linux-4.2/mm/memory.c linux-4.2-pck/mm/memory.c
--- linux-4.2/mm/memory.c	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/mm/memory.c	2015-09-08 00:53:25.270528186 -0300
@@ -120,6 +120,28 @@
 EXPORT_SYMBOL(zero_pfn);
 
+#ifdef CONFIG_UKSM
+unsigned long uksm_zero_pfn __read_mostly;
+EXPORT_SYMBOL_GPL(uksm_zero_pfn);
+struct page *empty_uksm_zero_page;
+
+static int __init setup_uksm_zero_page(void)
+{
+	unsigned long addr;
+	addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
+	if (!addr)
+		panic("Oh boy, that early out of memory?");
+
+	empty_uksm_zero_page = virt_to_page((void *) addr);
+	SetPageReserved(empty_uksm_zero_page);
+
+	uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
+
+	return 0;
+}
+core_initcall(setup_uksm_zero_page);
+#endif
+
 /*
  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
@@ -131,6 +153,7 @@
 core_initcall(init_zero_pfn);
 
+
 #if defined(SPLIT_RSS_COUNTING)
 
 void sync_mm_rss(struct mm_struct *mm)
@@ -877,6 +900,11 @@
 			rss[MM_ANONPAGES]++;
 		else
 			rss[MM_FILEPAGES]++;
+
+		/* Should return NULL in vm_normal_page() */
+		uksm_bugon_zeropage(pte);
+	} else {
+		uksm_map_zero_page(pte);
 	}
 
 out_set_pte:
@@ -1110,8 +1138,10 @@
 		ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
 		tlb_remove_tlb_entry(tlb, pte, addr);
-		if (unlikely(!page))
+		if (unlikely(!page)) {
+			uksm_unmap_zero_page(ptent);
 			continue;
+		}
 		if (PageAnon(page))
 			rss[MM_ANONPAGES]--;
 		else {
@@ -1944,8 +1974,10 @@
 		clear_page(kaddr);
 		kunmap_atomic(kaddr);
 		flush_dcache_page(dst);
-	} else
+	} else {
 		copy_user_highpage(dst, src, va, vma);
+		uksm_cow_page(vma, src);
+	}
 }
 
 /*
@@ -2075,6 +2107,7 @@
 		new_page = alloc_zeroed_user_highpage_movable(vma, address);
 		if (!new_page)
 			goto oom;
+		uksm_cow_pte(vma, orig_pte);
 	} else {
 		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!new_page)
@@ -2099,7 +2132,9 @@
 			dec_mm_counter_fast(mm, MM_FILEPAGES);
 			inc_mm_counter_fast(mm, MM_ANONPAGES);
 		}
+		uksm_bugon_zeropage(orig_pte);
 	} else {
+		uksm_unmap_zero_page(orig_pte);
 		inc_mm_counter_fast(mm, MM_ANONPAGES);
 	}
 	flush_cache_page(vma, address, pte_pfn(orig_pte));
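The uksm_map_zero_page()/uksm_unmap_zero_page()/uksm_bugon_zeropage() hooks added above are defined in the patch's uksm headers, which this excerpt does not show. Their implied bookkeeping is small; one plausible minimal shape, assuming a simple global counter keyed on uksm_zero_pfn (the _sketch names and the counter are ours, not the patch's):

    static atomic_long_t uksm_zero_pages = ATOMIC_LONG_INIT(0);    /* hypothetical */

    static inline void uksm_map_zero_page_sketch(pte_t pte)
    {
            if (pte_pfn(pte) == uksm_zero_pfn)
                    atomic_long_inc(&uksm_zero_pages);
    }

    static inline void uksm_unmap_zero_page_sketch(pte_t pte)
    {
            if (pte_pfn(pte) == uksm_zero_pfn)
                    atomic_long_dec(&uksm_zero_pages);
    }

    static inline void uksm_bugon_zeropage_sketch(pte_t pte)
    {
            /* callers on this path must never see the UKSM zero page */
            BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
    }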
diff -Nur linux-4.2/mm/mmap.c linux-4.2-pck/mm/mmap.c
--- linux-4.2/mm/mmap.c	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/mm/mmap.c	2015-09-08 00:51:03.464001315 -0300
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -276,6 +277,7 @@
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
+	uksm_remove_vma(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 	return next;
 }
@@ -733,9 +735,16 @@
 	long adjust_next = 0;
 	int remove_next = 0;
 
+/*
+ * to avoid deadlock, uksm_remove_vma must be called before any spin_lock is
+ * acquired
+ */
+	uksm_remove_vma(vma);
+
 	if (next && !insert) {
 		struct vm_area_struct *exporter = NULL;
 
+		uksm_remove_vma(next);
 		if (end >= next->vm_end) {
 			/*
 			 * vma expands, overlapping all the next, and
@@ -829,6 +838,7 @@
 		end_changed = true;
 	}
 	vma->vm_pgoff = pgoff;
+
 	if (adjust_next) {
 		next->vm_start += adjust_next << PAGE_SHIFT;
 		next->vm_pgoff += adjust_next;
@@ -899,16 +909,22 @@
 	 * up the code too much to do both in one go.
 	 */
 	next = vma->vm_next;
-	if (remove_next == 2)
+	if (remove_next == 2) {
+		uksm_remove_vma(next);
 		goto again;
-	else if (next)
+	} else if (next) {
 		vma_gap_update(next);
-	else
+	} else {
 		mm->highest_vm_end = end;
+	}
+	} else {
+		if (next && !insert)
+			uksm_vma_add_new(next);
 	}
 
 	if (insert && file)
 		uprobe_mmap(insert);
 
+	uksm_vma_add_new(vma);
 	validate_mm(mm);
 
 	return 0;
@@ -1301,6 +1317,9 @@
 	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
+	/* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
+	uksm_vm_flags_mod(&vm_flags);
+
 	if (flags & MAP_LOCKED)
 		if (!can_do_mlock())
 			return -EPERM;
@@ -1642,6 +1661,7 @@
 			allow_write_access(file);
 	}
 	file = vma->vm_file;
+	uksm_vma_add_new(vma);
 out:
 	perf_event_mmap(vma);
 
@@ -1683,6 +1703,7 @@
 	if (vm_flags & VM_DENYWRITE)
 		allow_write_access(file);
 free_vma:
+	uksm_remove_vma(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
 	if (charged)
@@ -2484,6 +2505,8 @@
 	else
 		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 
+	uksm_vma_add_new(new);
+
 	/* Success. */
 	if (!err)
 		return 0;
@@ -2721,6 +2744,7 @@
 		return addr;
 
 	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+	uksm_vm_flags_mod(&flags);
 
 	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
 	if (error & ~PAGE_MASK)
@@ -2778,6 +2802,7 @@
 	vma->vm_flags = flags;
 	vma->vm_page_prot = vm_get_page_prot(flags);
 	vma_link(mm, vma, prev, rb_link, rb_parent);
+	uksm_vma_add_new(vma);
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
@@ -2813,6 +2838,12 @@
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
 
+	/*
+	 * Taking the write lock on mmap_sem does not harm others,
+	 * but it's crucial for uksm to avoid races.
+	 */
+	down_write(&mm->mmap_sem);
+
 	if (mm->locked_vm) {
 		vma = mm->mmap;
 		while (vma) {
@@ -2848,6 +2879,11 @@
 		vma = remove_vma(vma);
 	}
 	vm_unacct_memory(nr_accounted);
+
+	mm->mmap = NULL;
+	mm->mm_rb = RB_ROOT;
+	vmacache_invalidate(mm);
+	up_write(&mm->mmap_sem);
 }
 
 /* Insert vm structure into process list sorted by address
@@ -2954,6 +2990,7 @@
 			new_vma->vm_ops->open(new_vma);
 		vma_link(mm, new_vma, prev, rb_link, rb_parent);
 		*need_rmap_locks = false;
+		uksm_vma_add_new(new_vma);
 		}
 	}
 	return new_vma;
@@ -3067,10 +3104,10 @@
 	ret = insert_vm_struct(mm, vma);
 	if (ret)
 		goto out;
-	mm->total_vm += len >> PAGE_SHIFT;
 
 	perf_event_mmap(vma);
+	uksm_vma_add_new(vma);
 
 	return vma;
diff -Nur linux-4.2/mm/page-writeback.c linux-4.2-pck/mm/page-writeback.c
--- linux-4.2/mm/page-writeback.c	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/mm/page-writeback.c	2015-09-08 00:48:22.241696581 -0300
@@ -70,13 +70,21 @@
 /*
  * Start background writeback (via writeback threads) at this percentage
 */
+#ifdef CONFIG_PCK_INTERACTIVE
+int dirty_background_ratio;
+#else
 int dirty_background_ratio = 10;
+#endif
 
 /*
  * dirty_background_bytes starts at 0 (disabled) so that it is a function of
  * dirty_background_ratio * the amount of dirtyable memory
 */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned long dirty_background_bytes = 128 * 1024 * 1024;
+#else
 unsigned long dirty_background_bytes;
+#endif
 
 /*
  * free highmem will not be subtracted from the total free memory
@@ -87,13 +95,21 @@
 /*
  * The generator of dirty data starts writeback at this percentage
 */
+#ifdef CONFIG_PCK_INTERACTIVE
+int vm_dirty_ratio;
+#else
 int vm_dirty_ratio = 20;
+#endif
 
 /*
  * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
  * vm_dirty_ratio * the amount of dirtyable memory
 */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned long vm_dirty_bytes = 256 * 1024 * 1024;
+#else
 unsigned long vm_dirty_bytes;
+#endif
 
 /*
  * The interval between `kupdate'-style writebacks
diff -Nur linux-4.2/mm/rmap.c linux-4.2-pck/mm/rmap.c
--- linux-4.2/mm/rmap.c	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/mm/rmap.c	2015-09-08 00:51:03.464001315 -0300
@@ -962,9 +962,9 @@
 
 /**
  * __page_set_anon_rmap - set up new anonymous rmap
- * @page: Page to add to rmap
+ * @page:	Page to add to rmap
  * @vma: VM area to add page to.
- * @address: User virtual address of the mapping
+ * @address:	User virtual address of the mapping
  * @exclusive: the page is exclusively owned by the current process
 */
 static void __page_set_anon_rmap(struct page *page,
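With CONFIG_PCK_INTERACTIVE, the page-writeback.c hunk above replaces the percentage thresholds with fixed byte budgets: background writeback starts at 128 MiB of dirty memory and synchronous throttling at 256 MiB. The kernel treats each bytes/ratio pair as mutually exclusive, so the matching ratio reads back as 0. A small userspace check of the booted values (the program itself is ours; the /proc/sys/vm paths are standard):

    #include <stdio.h>

    static void show(const char *name)
    {
            char path[128];
            unsigned long val;
            FILE *f;

            snprintf(path, sizeof(path), "/proc/sys/vm/%s", name);
            f = fopen(path, "r");
            if (!f)
                    return;
            if (fscanf(f, "%lu", &val) == 1)
                    printf("%-24s %lu\n", name, val);
            fclose(f);
    }

    int main(void)
    {
            show("dirty_background_ratio");
            show("dirty_background_bytes");  /* 134217728 with PCK_INTERACTIVE */
            show("dirty_ratio");
            show("dirty_bytes");             /* 268435456 with PCK_INTERACTIVE */
            return 0;
    }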
diff -Nur linux-4.2/mm/uksm.c linux-4.2-pck/mm/uksm.c
--- linux-4.2/mm/uksm.c	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/mm/uksm.c	2015-09-08 00:51:03.470667662 -0300
@@ -0,0 +1,5525 @@
+/*
+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
+ *
+ * This is an improvement upon KSM. Some basic data structures and routines
+ * are borrowed from ksm.c .
+ *
+ * Its new features:
+ * 1. Full system scan:
+ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
+ *      interaction to submit a memory area to KSM is no longer needed.
+ *
+ * 2. Rich area detection:
+ *      It automatically detects rich areas containing abundant duplicated
+ *      pages. Rich areas are given a full scan speed. Poor areas are
+ *      sampled at a reasonable speed with very low CPU consumption.
+ *
+ * 3. Ultra Per-page scan speed improvement:
+ *      A new hash algorithm is proposed. As a result, on a machine with a
+ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
+ *      can scan memory areas that do not contain duplicated pages at a speed of
+ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
+ *      477MB/sec ~ 923MB/sec.
+ *
+ * 4. Thrashing area avoidance:
+ *      Thrashing areas (VMAs that have frequent KSM page break-outs) can be
+ *      filtered out. My benchmark shows it's more efficient than KSM's per-page
+ *      hash value based volatile page detection.
+ *
+ *
+ * 5. Misc changes upon KSM:
+ *      * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
+ *        comparison. It's much faster than the default C version on x86.
+ *      * rmap_item now has a struct page * member to loosely cache an
+ *        address-->page mapping, which reduces many time-costly
+ *        follow_page() calls.
+ *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
+ *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_ksm
+ *        is needed for this case.
+ *
+ * 6. Full Zero Page consideration (contributed by Figo Zhang)
+ *      Now uksmd considers full zero pages as special pages and merges them
+ *      into a special unswappable uksm zero page.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include "internal.h"
+
+#ifdef CONFIG_X86
+#undef memcmp
+
+#ifdef CONFIG_X86_32
+#define memcmp memcmpx86_32
+/*
+ * Compare 4-byte-aligned addresses s1 and s2, with length n
+ */
+int memcmpx86_32(void *s1, void *s2, size_t n)
+{
+	size_t num = n / 4;
+	register int res;
+
+	__asm__ __volatile__
+	(
+	 "testl %3,%3\n\t"
+	 "repe; cmpsd\n\t"
+	 "je        1f\n\t"
+	 "sbbl      %0,%0\n\t"
+	 "orl       $1,%0\n"
+	 "1:"
+	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
+	 : "0" (0)
+	 : "cc");
+
+	return res;
+}
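UKSM only needs a fast equality test and a self-consistent ordering from this memcmp replacement; because repe; cmpsd compares 32-bit words, the sign of the result can differ from byte-wise memcmp on little-endian data, though equality cannot. A quick userspace sanity check against the function above, built with gcc -m32 (the harness itself is ours):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int memcmpx86_32(void *s1, void *s2, size_t n);    /* as defined above */

    int main(void)
    {
            static uint32_t a[1024], b[1024];    /* 4-byte aligned, n % 4 == 0 */

            memset(a, 0x5a, sizeof(a));
            memset(b, 0x5a, sizeof(b));
            assert(memcmpx86_32(a, b, sizeof(a)) == 0);

            b[512] ^= 1;    /* single-bit difference */
            assert(memcmpx86_32(a, b, sizeof(a)) != 0);
            return 0;
    }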
+
+/*
+ * Check whether the page is all zeros.
+ */
+static int is_full_zero(const void *s1, size_t len)
+{
+	unsigned char same;
+
+	len /= 4;
+
+	__asm__ __volatile__
+	("repe; scasl;"
+	 "sete %0"
+	 : "=qm" (same), "+D" (s1), "+c" (len)
+	 : "a" (0)
+	 : "cc");
+
+	return same;
+}
+
+
+#elif defined(CONFIG_X86_64)
+#define memcmp memcmpx86_64
+/*
+ * Compare 8-byte-aligned addresses s1 and s2, with length n
+ */
+int memcmpx86_64(void *s1, void *s2, size_t n)
+{
+	size_t num = n / 8;
+	register int res;
+
+	__asm__ __volatile__
+	(
+	 "testq %q3,%q3\n\t"
+	 "repe; cmpsq\n\t"
+	 "je        1f\n\t"
+	 "sbbq      %q0,%q0\n\t"
+	 "orq       $1,%q0\n"
+	 "1:"
+	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
+	 : "0" (0)
+	 : "cc");
+
+	return res;
+}
+
+static int is_full_zero(const void *s1, size_t len)
+{
+	unsigned char same;
+
+	len /= 8;
+
+	__asm__ __volatile__
+	("repe; scasq;"
+	 "sete %0"
+	 : "=qm" (same), "+D" (s1), "+c" (len)
+	 : "a" (0)
+	 : "cc");
+
+	return same;
+}
+
+#endif
+#else
+static int is_full_zero(const void *s1, size_t len)
+{
+	const unsigned long *src = s1;
+	int i;
+
+	len /= sizeof(*src);
+
+	for (i = 0; i < len; i++) {
+		if (src[i])
+			return 0;
+	}
+
+	return 1;
+}
+#endif
+
+#define UKSM_RUNG_ROUND_FINISHED	(1 << 0)
+#define TIME_RATIO_SCALE	10000
+
+#define SLOT_TREE_NODE_SHIFT	8
+#define SLOT_TREE_NODE_STORE_SIZE	(1UL << SLOT_TREE_NODE_SHIFT)
+struct slot_tree_node {
+	unsigned long size;
+	struct sradix_tree_node snode;
+	void *stores[SLOT_TREE_NODE_STORE_SIZE];
+};
+
+static struct kmem_cache *slot_tree_node_cachep;
+
+static struct sradix_tree_node *slot_tree_node_alloc(void)
+{
+	struct slot_tree_node *p;
+	p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL);
+	if (!p)
+		return NULL;
+
+	return &p->snode;
+}
+
+static void slot_tree_node_free(struct sradix_tree_node *node)
+{
+	struct slot_tree_node *p;
+
+	p = container_of(node, struct slot_tree_node, snode);
+	kmem_cache_free(slot_tree_node_cachep, p);
+}
+
+static void slot_tree_node_extend(struct sradix_tree_node *parent,
+				  struct sradix_tree_node *child)
+{
+	struct slot_tree_node *p, *c;
+
+	p = container_of(parent, struct slot_tree_node, snode);
+	c = container_of(child, struct slot_tree_node, snode);
+
+	p->size += c->size;
+}
+
+void slot_tree_node_assign(struct sradix_tree_node *node,
+			   unsigned index, void *item)
+{
+	struct vma_slot *slot = item;
+	struct slot_tree_node *cur;
+
+	slot->snode = node;
+	slot->sindex = index;
+
+	while (node) {
+		cur = container_of(node, struct slot_tree_node, snode);
+		cur->size += slot->pages;
+		node = node->parent;
+	}
+}
+
+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset)
+{
+	struct vma_slot *slot;
+	struct slot_tree_node *cur;
+	unsigned long pages;
+
+	if (node->height == 1) {
+		slot = node->stores[offset];
+		pages = slot->pages;
+	} else {
+		cur = container_of(node->stores[offset],
+				   struct slot_tree_node, snode);
+		pages = cur->size;
+	}
+
+	while (node) {
+		cur = container_of(node, struct slot_tree_node, snode);
+		cur->size -= pages;
+		node = node->parent;
+	}
+}
+
+unsigned long slot_iter_index;
+int slot_iter(void *item, unsigned long height)
+{
+	struct slot_tree_node *node;
+	struct vma_slot *slot;
+
+	if (height == 1) {
+		slot = item;
+		if (slot_iter_index < slot->pages) {
+			/* in this one */
+			return 1;
+		} else {
+			slot_iter_index -= slot->pages;
+			return 0;
+		}
+
+	} else {
+		node = container_of(item, struct slot_tree_node, snode);
+		if (slot_iter_index < node->size) {
+			/* in this one */
+			return 1;
+		} else {
+			slot_iter_index -= node->size;
+			return 0;
+		}
+	}
+}
+
+
+static inline void slot_tree_init_root(struct sradix_tree_root *root)
+{
+	init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
+	root->alloc = slot_tree_node_alloc;
+	root->free = slot_tree_node_free;
+	root->extend = slot_tree_node_extend;
+	root->assign = slot_tree_node_assign;
+	root->rm = slot_tree_node_rm;
+}
+
+void slot_tree_init(void)
+{
+	slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
+				sizeof(struct slot_tree_node), 0,
+				SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
+				NULL);
+}
+
+
+/* Each rung of this ladder is a list of VMAs having the same scan ratio */
+struct scan_rung {
+	//struct list_head scanned_list;
+	struct sradix_tree_root vma_root;
+	struct sradix_tree_root vma_root2;
+
+	struct vma_slot *current_scan;
+	unsigned long current_offset;
+
+	/*
+	 * The initial value for current_offset, it should loop over
+	 * [0 ~ step - 1] to let every slot have its chance to be scanned.
+	 */
+	unsigned long offset_init;
+	unsigned long step; /* dynamic step for current_offset */
+	unsigned int flags;
+	unsigned long pages_to_scan;
+	//unsigned long fully_scanned_slots;
+	/*
+	 * a little bit tricky - if cpu_time_ratio > 0, then the value is the
+	 * cpu time ratio it can spend in rung_i for every scan
+	 * period. if < 0, then it is the cpu time ratio relative to the
+	 * max cpu percentage user specified. Both in unit of
+	 * 1/TIME_RATIO_SCALE
+	 */
+	int cpu_ratio;
+
+	/*
+	 * How long it will take for all slots in this rung to be fully
+	 * scanned? If it's zero, we don't care about the cover time:
+	 * it's fully scanned.
+	 */
+	unsigned int cover_msecs;
+	//unsigned long vma_num;
+	//unsigned long pages; /* Sum of all slot's pages in rung */
+};
+
+/**
+ * node of either the stable or unstable rbtree
+ *
+ */
+struct tree_node {
+	struct rb_node node; /* link in the main (un)stable rbtree */
+	struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
+	u32 hash;
+	unsigned long count; /* TODO: merged with sub_root */
+	struct list_head all_list; /* all tree nodes in stable/unstable tree */
+};
+
+/**
+ * struct stable_node - node of the stable rbtree
+ * @node: rb node of this ksm page in the stable tree
+ * @hlist: hlist head of rmap_items using this ksm page
+ * @kpfn: page frame number of this ksm page
+ */
+struct stable_node {
+	struct rb_node node; /* link in sub-rbtree */
+	struct tree_node *tree_node; /* its tree node root in stable tree, NULL if it's in hell list */
+	struct hlist_head hlist;
+	unsigned long kpfn;
+	u32 hash_max; /* if ==0 then it's not been calculated yet */
+	struct list_head all_list; /* in a list for all stable nodes */
+};
+
+/**
+ * struct node_vma - group rmap_items linked in the same stable
+ * node together.
+ */
+struct node_vma {
+	union {
+		struct vma_slot *slot;
+		unsigned long key;  /* slot is used as the key, sorted on hlist */
+	};
+	struct hlist_node hlist;
+	struct hlist_head rmap_hlist;
+	struct stable_node *head;
+};
+
+/**
+ * struct rmap_item - reverse mapping item for virtual addresses
+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
+ * @mm: the memory structure this rmap_item is pointing into
+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
+ * @node: rb node of this rmap_item in the unstable tree
+ * @head: pointer to stable_node heading this list in the stable tree
+ * @hlist: link into hlist of rmap_items hanging off that stable_node
+ */
+struct rmap_item {
+	struct vma_slot *slot;
+	struct page *page;
+	unsigned long address;	/* + low bits used for flags below */
+	unsigned long hash_round;
+	unsigned long entry_index;
+	union {
+		struct {/* when in unstable tree */
+			struct rb_node node;
+			struct tree_node *tree_node;
+			u32 hash_max;
+		};
+		struct { /* when in stable tree */
+			struct node_vma *head;
+			struct hlist_node hlist;
+			struct anon_vma *anon_vma;
+		};
+	};
+} __attribute__((aligned(4)));
+
+struct rmap_list_entry {
+	union {
+		struct rmap_item *item;
+		unsigned long addr;
+	};
+	/* lowest bit is used for is_addr tag */
+} __attribute__((aligned(4))); /* 4-byte aligned to fit into pages */
+
+
+/* Basic data structure definitions end */
+
+
+/*
+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
+ * The flags use the low bits of rmap_item.address
+ */
+#define UNSTABLE_FLAG	0x1
+#define STABLE_FLAG	0x2
+#define get_rmap_addr(x)	((x)->address & PAGE_MASK)
+
+/*
+ * rmap_list_entry helpers
+ */
+#define IS_ADDR_FLAG	1
+#define is_addr(ptr)		((unsigned long)(ptr) & IS_ADDR_FLAG)
+#define set_is_addr(ptr)	((ptr) |= IS_ADDR_FLAG)
+#define get_clean_addr(ptr)	(((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
+
+
+/*
+ * High speed caches for frequently allocated and freed structs
+ */
+static struct kmem_cache *rmap_item_cache;
+static struct kmem_cache *stable_node_cache;
+static struct kmem_cache *node_vma_cache;
+static struct kmem_cache *vma_slot_cache;
+static struct kmem_cache *tree_node_cache;
+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
+		sizeof(struct __struct), __alignof__(struct __struct),\
+		(__flags), NULL)
+
+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
+#define SCAN_LADDER_SIZE 4
+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
+
+/* The evaluation rounds uksmd has finished */
+static unsigned long long uksm_eval_round = 1;
+
+/*
+ * we add 1 to this var when we consider we should rebuild the whole
+ * unstable tree.
+ */
+static unsigned long uksm_hash_round = 1;
+
+/*
+ * How many times the whole memory is scanned.
+ */
+static unsigned long long fully_scanned_round = 1;
+
+/* The total number of virtual pages of all vma slots */
+static u64 uksm_pages_total;
+
+/* The number of pages that have been scanned since startup */
+static u64 uksm_pages_scanned;
+
+static u64 scanned_virtual_pages;
+
+/* The number of pages scanned since the last encode_benefit call */
+static u64 uksm_pages_scanned_last;
+
+/* If the scanned number grows too large, we encode it here */
+static u64 pages_scanned_stored;
+
+static unsigned long pages_scanned_base;
+
+/* The number of nodes in the stable tree */
+static unsigned long uksm_pages_shared;
+
+/* The number of page slots additionally sharing those nodes */
+static unsigned long uksm_pages_sharing;
+
+/* The number of nodes in the unstable tree */
+static unsigned long uksm_pages_unshared;
+
+/*
+ * Milliseconds ksmd should sleep between scans,
+ * >= 100ms to be consistent with
+ * scan_time_to_sleep_msec()
+ */
+static unsigned int uksm_sleep_jiffies;
+
+/* The real value for the uksmd next sleep */
+static unsigned int uksm_sleep_real;
+
+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
+static unsigned int uksm_sleep_saved;
+
+/* Max percentage of cpu utilization ksmd can take to scan in one batch */
+static unsigned int uksm_max_cpu_percentage;
+
+static int uksm_cpu_governor;
+
+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
+
+struct uksm_cpu_preset_s {
+	int cpu_ratio[SCAN_LADDER_SIZE];
+	unsigned int cover_msecs[SCAN_LADDER_SIZE];
+	unsigned int max_cpu; /* percentage */
+};
+
+struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
+	{ {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
+	{ {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
+	{ {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
+	{ {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
+};
+
+/* The default value for uksm_ema_page_time if it's not initialized */
+#define UKSM_PAGE_TIME_DEFAULT	500
+
+/* cost to scan one page, as an exponential moving average in nsecs */
+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
+
+/* The exponential moving average alpha weight, in percentage. */
+#define EMA_ALPHA	20
+
+/*
+ * The threshold used to filter out thrashing areas.
+ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound
+ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
+ * will be considered as having a zero duplication ratio.
+ */
+static unsigned int uksm_thrash_threshold = 50;
+
+/* How much dedup ratio is considered to be abundant */
+static unsigned int uksm_abundant_threshold = 10;
+
+/* All slots having merged pages in this eval round. */
+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
+
+/* How many times ksmd has slept since startup */
+static unsigned long long uksm_sleep_times;
+
+#define UKSM_RUN_STOP	0
+#define UKSM_RUN_MERGE	1
+static unsigned int uksm_run = 1;
+
+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
+static DEFINE_MUTEX(uksm_thread_mutex);
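In uksm_cpu_preset above, a non-negative cpu_ratio is an absolute CPU-time share in units of 1/TIME_RATIO_SCALE (10000), while a negative one is read as a share of the user's max_cpu percentage, per the cpu_ratio comment in struct scan_rung. One way to resolve it (our illustration; the daemon's own arithmetic may differ in rounding):

    static int effective_cpu_ratio(int cpu_ratio, unsigned int max_cpu)
    {
            if (cpu_ratio >= 0)
                    return cpu_ratio;                       /* absolute, 1/10000 units */

            return (-cpu_ratio) * (int)max_cpu / 100;       /* relative to max_cpu */
    }

    /*
     * "full" preset {20, 40, -2500, -10000} with max_cpu = 95:
     * rung 2 -> 2375/10000 (23.75% CPU), rung 3 -> 9500/10000 (95% CPU).
     */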
+
+/*
+ * List vma_slot_new is for newly created vma_slots waiting to be added by
+ * ksmd. If one cannot be added (e.g. because it's too small), it's moved to
+ * vma_slot_noadd. vma_slot_del is the list for vma_slots whose corresponding
+ * VMA has been removed/freed.
+ */
+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
+static DEFINE_SPINLOCK(vma_slot_list_lock);
+
+/* The unstable tree heads */
+static struct rb_root root_unstable_tree = RB_ROOT;
+
+/*
+ * All tree_nodes are in a list to be freed at once when the unstable tree is
+ * freed after each scan round.
+ */
+static struct list_head unstable_tree_node_list =
+				LIST_HEAD_INIT(unstable_tree_node_list);
+
+/* List contains all stable nodes */
+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
+
+/*
+ * When the hash strength is changed, the stable tree must be delta_hashed and
+ * re-structured. We use two sets of the structs below to speed up the
+ * re-structuring of the stable tree.
+ */
+static struct list_head
+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
+			    LIST_HEAD_INIT(stable_tree_node_list[1])};
+
+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
+static struct rb_root *root_stable_treep = &root_stable_tree[0];
+static unsigned long stable_tree_index;
+
+/* The hash strength needed to hash a full page */
+#define HASH_STRENGTH_FULL		(PAGE_SIZE / sizeof(u32))
+
+/* The hash strength needed for loop-back hashing */
+#define HASH_STRENGTH_MAX		(HASH_STRENGTH_FULL + 10)
+
+/* The random offsets in a page */
+static u32 *random_nums;
+
+/* The hash strength */
+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
+
+/* The delta value each time the hash strength increases or decreases */
+static unsigned long hash_strength_delta;
+#define HASH_STRENGTH_DELTA_MAX	5
+
+/* The time we have saved due to random_sample_hash */
+static u64 rshash_pos;
+
+/* The time we have wasted due to hash collision */
+static u64 rshash_neg;
+
+struct uksm_benefit {
+	u64 pos;
+	u64 neg;
+	u64 scanned;
+	unsigned long base;
+} benefit;
+
+/*
+ * The relative cost of memcmp, compared to 1 time unit of random sample
+ * hash, this value is tested when the ksm module is initialized
+ */
+static unsigned long memcmp_cost;
+
+static unsigned long rshash_neg_cont_zero;
+static unsigned long rshash_cont_obscure;
+
+/* The possible states of the hash strength adjustment heuristic */
+enum rshash_states {
+	RSHASH_STILL,
+	RSHASH_TRYUP,
+	RSHASH_TRYDOWN,
+	RSHASH_NEW,
+	RSHASH_PRE_STILL,
+};
+
+/* The possible direction we are about to adjust hash strength */
+enum rshash_direct {
+	GO_UP,
+	GO_DOWN,
+	OBSCURE,
+	STILL,
+};
+
+/* random sampling hash state machine */
+static struct {
+	enum rshash_states state;
+	enum rshash_direct pre_direct;
+	u8 below_count;
+	/* Keep a lookup window of size 5; iff above_count/below_count > 3
+	 * in this window we stop trying.
+	 */
+	u8 lookup_window_index;
+	u64 stable_benefit;
+	unsigned long turn_point_down;
+	unsigned long turn_benefit_down;
+	unsigned long turn_point_up;
+	unsigned long turn_benefit_up;
+	unsigned long stable_point;
+} rshash_state;
+
+/* zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX] */
+static u32 *zero_hash_table;
+
+static inline struct node_vma *alloc_node_vma(void)
+{
+	struct node_vma *node_vma;
+	node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
+	if (node_vma) {
+		INIT_HLIST_HEAD(&node_vma->rmap_hlist);
+		INIT_HLIST_NODE(&node_vma->hlist);
+	}
+	return node_vma;
+}
+
+static inline void free_node_vma(struct node_vma *node_vma)
+{
+	kmem_cache_free(node_vma_cache, node_vma);
+}
+
+
+static inline struct vma_slot *alloc_vma_slot(void)
+{
+	struct vma_slot *slot;
+
+	/*
+	 * In case ksm is not initialized by now.
+	 * Oops, we need to consider the call site of uksm_init() in the future.
+	 */
+	if (!vma_slot_cache)
+		return NULL;
+
+	slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
+	if (slot) {
+		INIT_LIST_HEAD(&slot->slot_list);
+		INIT_LIST_HEAD(&slot->dedup_list);
+		slot->flags |= UKSM_SLOT_NEED_RERAND;
+	}
+	return slot;
+}
+
+static inline void free_vma_slot(struct vma_slot *vma_slot)
+{
+	kmem_cache_free(vma_slot_cache, vma_slot);
+}
+
+
+
+static inline struct rmap_item *alloc_rmap_item(void)
+{
+	struct rmap_item *rmap_item;
+
+	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
+	if (rmap_item) {
+		/* bug on lowest bit is not clear for flag use */
+		BUG_ON(is_addr(rmap_item));
+	}
+	return rmap_item;
+}
+
+static inline void free_rmap_item(struct rmap_item *rmap_item)
+{
+	rmap_item->slot = NULL;	/* debug safety */
+	kmem_cache_free(rmap_item_cache, rmap_item);
+}
+
+static inline struct stable_node *alloc_stable_node(void)
+{
+	struct stable_node *node;
+	node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
+	if (!node)
+		return NULL;
+
+	INIT_HLIST_HEAD(&node->hlist);
+	list_add(&node->all_list, &stable_node_list);
+	return node;
+}
+
+static inline void free_stable_node(struct stable_node *stable_node)
+{
+	list_del(&stable_node->all_list);
+	kmem_cache_free(stable_node_cache, stable_node);
+}
+
+static inline struct tree_node *alloc_tree_node(struct list_head *list)
+{
+	struct tree_node *node;
+	node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
+	if (!node)
+		return NULL;
+
+	list_add(&node->all_list, list);
+	return node;
+}
+
+static inline void free_tree_node(struct tree_node *node)
+{
+	list_del(&node->all_list);
+	kmem_cache_free(tree_node_cache, node);
+}
+
+static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
+{
+	struct anon_vma *anon_vma = rmap_item->anon_vma;
+
+	put_anon_vma(anon_vma);
+}
+
+
+/**
+ * Remove a stable node from stable_tree, may unlink from its tree_node and
+ * may remove its parent tree_node if no other stable node is pending.
+ *
+ * @stable_node	     The node to be removed
+ * @unlink_rb	     Will this node be unlinked from the rbtree?
+ * @remove_tree_node Will its tree_node be removed if empty?
+ */
+static void remove_node_from_stable_tree(struct stable_node *stable_node,
+					 int unlink_rb, int remove_tree_node)
+{
+	struct node_vma *node_vma;
+	struct rmap_item *rmap_item;
+	struct hlist_node *n;
+
+	if (!hlist_empty(&stable_node->hlist)) {
+		hlist_for_each_entry_safe(node_vma, n,
+					  &stable_node->hlist, hlist) {
+			hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
+				uksm_pages_sharing--;
+
+				uksm_drop_anon_vma(rmap_item);
+				rmap_item->address &= PAGE_MASK;
+			}
+			free_node_vma(node_vma);
+			cond_resched();
+		}
+
+		/* the last one is counted as shared */
+		uksm_pages_shared--;
+		uksm_pages_sharing++;
+	}
+
+	if (stable_node->tree_node && unlink_rb) {
+		rb_erase(&stable_node->node,
+			 &stable_node->tree_node->sub_root);
+
+		if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
+		    remove_tree_node) {
+			rb_erase(&stable_node->tree_node->node,
+				 root_stable_treep);
+			free_tree_node(stable_node->tree_node);
+		} else {
+			stable_node->tree_node->count--;
+		}
+	}
+
+	free_stable_node(stable_node);
+}
+
+
+/*
+ * get_uksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive. So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node. This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ *
+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
+ * but this is different - made simpler by uksm_thread_mutex being held, but
+ * interesting for assuming that no other use of the struct page could ever
+ * put our expected_mapping into page->mapping (or a field of the union which
+ * coincides with page->mapping). The RCU calls are not for KSM at all, but
+ * to keep the page_count protocol described with page_cache_get_speculative.
+ *
+ * Note: it is possible that get_uksm_page() will return NULL one moment,
+ * then page the next, if the page is in between page_freeze_refs() and
+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
+ * is on its way to being freed; but it is an anomaly to bear in mind.
+ *
+ * @unlink_rb: if the removal of this node will firstly unlink from
+ * its rbtree. stable_node_reinsert will prevent this when restructuring the
+ * node from its old tree.
+ *
+ * @remove_tree_node: if this is the last one of its tree_node, will the
+ * tree_node be freed? If we are inserting a stable node, this tree_node may
+ * be reused, so don't free it.
+ */
+static struct page *get_uksm_page(struct stable_node *stable_node,
+				  int unlink_rb, int remove_tree_node)
+{
+	struct page *page;
+	void *expected_mapping;
+
+	page = pfn_to_page(stable_node->kpfn);
+	expected_mapping = (void *)stable_node +
+				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+	rcu_read_lock();
+	if (page->mapping != expected_mapping)
+		goto stale;
+	if (!get_page_unless_zero(page))
+		goto stale;
+	if (page->mapping != expected_mapping) {
+		put_page(page);
+		goto stale;
+	}
+	rcu_read_unlock();
+	return page;
+stale:
+	rcu_read_unlock();
+	remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
+
+	return NULL;
+}
+
+/*
+ * Removing rmap_item from stable or unstable tree.
+ * This function will clean the information from the stable/unstable tree.
+ */
+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+{
+	if (rmap_item->address & STABLE_FLAG) {
+		struct stable_node *stable_node;
+		struct node_vma *node_vma;
+		struct page *page;
+
+		node_vma = rmap_item->head;
+		stable_node = node_vma->head;
+		page = get_uksm_page(stable_node, 1, 1);
+		if (!page)
+			goto out;
+
+		/*
+		 * page lock is needed because it's racing with
+		 * try_to_unmap_ksm(), etc.
+		 */
+		lock_page(page);
+		hlist_del(&rmap_item->hlist);
+
+		if (hlist_empty(&node_vma->rmap_hlist)) {
+			hlist_del(&node_vma->hlist);
+			free_node_vma(node_vma);
+		}
+		unlock_page(page);
+
+		put_page(page);
+		if (hlist_empty(&stable_node->hlist)) {
+			/* do NOT call remove_node_from_stable_tree() here,
+			 * it's possible for a forked rmap_item not in
+			 * stable tree while the in-tree rmap_items were
+			 * deleted.
+			 */
+			uksm_pages_shared--;
+		} else
+			uksm_pages_sharing--;
+
+
+		uksm_drop_anon_vma(rmap_item);
+	} else if (rmap_item->address & UNSTABLE_FLAG) {
+		if (rmap_item->hash_round == uksm_hash_round) {
+
+			rb_erase(&rmap_item->node,
+				 &rmap_item->tree_node->sub_root);
+			if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
+				rb_erase(&rmap_item->tree_node->node,
+					 &root_unstable_tree);
+
+				free_tree_node(rmap_item->tree_node);
+			} else
+				rmap_item->tree_node->count--;
+		}
+		uksm_pages_unshared--;
+	}
+
+	rmap_item->address &= PAGE_MASK;
+	rmap_item->hash_max = 0;
+
+out:
+	cond_resched();		/* we're called from many long loops */
+}
+
+static inline int slot_in_uksm(struct vma_slot *slot)
+{
+	return list_empty(&slot->slot_list);
+}
+
+/*
+ * Test if the mm is exiting
+ */
+static inline bool uksm_test_exit(struct mm_struct *mm)
+{
+	return atomic_read(&mm->mm_users) == 0;
+}
+
+static inline unsigned long vma_pool_size(struct vma_slot *slot)
+{
+	return round_up(sizeof(struct rmap_list_entry) * slot->pages,
+			PAGE_SIZE) >> PAGE_SHIFT;
+}
+
+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
+
+/* must be done with sem locked */
+static int slot_pool_alloc(struct vma_slot *slot)
+{
+	unsigned long pool_size;
+
+	if (slot->rmap_list_pool)
+		return 0;
+
+	pool_size = vma_pool_size(slot);
+	slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
+				       pool_size, GFP_KERNEL);
+	if (!slot->rmap_list_pool)
+		return -ENOMEM;
+
+	slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size,
+				    GFP_KERNEL);
+	if (!slot->pool_counts) {
+		kfree(slot->rmap_list_pool);
+		return -ENOMEM;
+	}
+
+	slot->pool_size = pool_size;
+	BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
+	slot->flags |= UKSM_SLOT_IN_UKSM;
+	uksm_pages_total += slot->pages;
+
+	return 0;
+}
+
+/*
+ * Called after vma is unlinked from its mm
+ */
+void uksm_remove_vma(struct vm_area_struct *vma)
+{
+	struct vma_slot *slot;
+
+	if (!vma->uksm_vma_slot)
+		return;
+
+	spin_lock(&vma_slot_list_lock);
+	slot = vma->uksm_vma_slot;
+	if (!slot)
+		goto out;
+
+	if (slot_in_uksm(slot)) {
+		/**
+		 * This slot has been added by ksmd, so move it to the del list
+		 * waiting for ksmd to free it.
+		 */
+		list_add_tail(&slot->slot_list, &vma_slot_del);
+	} else {
+		/**
+		 * It's still on the new list. It's ok to free the slot directly.
+		 */
+		list_del(&slot->slot_list);
+		free_vma_slot(slot);
+	}
+out:
+	vma->uksm_vma_slot = NULL;
+	spin_unlock(&vma_slot_list_lock);
+}
+
+/**
+ * Need to do two things:
+ * 1. check if slot was moved to the del list
+ * 2. make sure the mmap_sem is manipulated under a valid vma.
+ *
+ * My concern here is that in some cases, this may make
+ * vma_slot_list_lock() waiters be serialized further by some
+ * sem->wait_lock, can this really be expensive?
+ *
+ *
+ * @return
+ * 0: if successfully locked mmap_sem
+ * -ENOENT: this slot was moved to the del list
+ * -EBUSY: vma lock failed
+ */
+static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct rw_semaphore *sem;
+
+	spin_lock(&vma_slot_list_lock);
+
+	/* the slot_list was removed and re-inited from the new list when it
+	 * entered uksm_list. If now it's not empty, then it must have been
+	 * moved to the del list
+	 */
+	if (!slot_in_uksm(slot)) {
+		spin_unlock(&vma_slot_list_lock);
+		return -ENOENT;
+	}
+
+	BUG_ON(slot->pages != vma_pages(slot->vma));
+	/* Ok, vma still valid */
+	vma = slot->vma;
+	mm = vma->vm_mm;
+	sem = &mm->mmap_sem;
+
+	if (uksm_test_exit(mm)) {
+		spin_unlock(&vma_slot_list_lock);
+		return -ENOENT;
+	}
+
+	if (down_read_trylock(sem)) {
+		spin_unlock(&vma_slot_list_lock);
+		if (slot_pool_alloc(slot)) {
+			uksm_remove_vma(vma);
+			up_read(sem);
+			return -ENOENT;
+		}
+		return 0;
+	}
+
+	spin_unlock(&vma_slot_list_lock);
+	return -EBUSY;
+}
+
+static inline unsigned long
+vma_page_address(struct page *page, struct vm_area_struct *vma)
+{
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	unsigned long address;
+
+	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+		/* page should be within @vma mapping range */
+		return -EFAULT;
+	}
+	return address;
+}
+
+
+/* return 0 on success with the item's mmap_sem locked */
+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
+{
+	struct mm_struct *mm;
+	struct vma_slot *slot = item->slot;
+	int err = -EINVAL;
+
+	struct page *page;
+
+	/*
+	 * try_down_read_slot_mmap_sem() returns non-zero if the slot
+	 * has been removed by uksm_remove_vma().
+	 */
+	if (try_down_read_slot_mmap_sem(slot))
+		return -EBUSY;
+
+	mm = slot->vma->vm_mm;
+
+	if (uksm_test_exit(mm))
+		goto failout_up;
+
+	page = item->page;
+	rcu_read_lock();
+	if (!get_page_unless_zero(page)) {
+		rcu_read_unlock();
+		goto failout_up;
+	}
+
+	/* No need to consider huge pages here. */
+	if (item->slot->vma->anon_vma != page_anon_vma(page) ||
+	    vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
+		/*
+		 * TODO:
+		 * should we release this item because of its stale page
+		 * mapping?
+		 */
+		put_page(page);
+		rcu_read_unlock();
+		goto failout_up;
+	}
+	rcu_read_unlock();
+	return 0;
+
+failout_up:
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * What kind of VMA is considered?
+ */
+static inline int vma_can_enter(struct vm_area_struct *vma)
+{
+	return uksm_flags_can_scan(vma->vm_flags);
+}
+
+/*
+ * Called whenever a fresh new vma is created. A new vma_slot
+ * is created and inserted into a global list. Must be called
+ * after the vma is inserted into its mm.
+ */
+void uksm_vma_add_new(struct vm_area_struct *vma)
+{
+	struct vma_slot *slot;
+
+	if (!vma_can_enter(vma)) {
+		vma->uksm_vma_slot = NULL;
+		return;
+	}
+
+	slot = alloc_vma_slot();
+	if (!slot) {
+		vma->uksm_vma_slot = NULL;
+		return;
+	}
+
+	vma->uksm_vma_slot = slot;
+	vma->vm_flags |= VM_MERGEABLE;
+	slot->vma = vma;
+	slot->mm = vma->vm_mm;
+	slot->ctime_j = jiffies;
+	slot->pages = vma_pages(vma);
+	spin_lock(&vma_slot_list_lock);
+	list_add_tail(&slot->slot_list, &vma_slot_new);
+	spin_unlock(&vma_slot_list_lock);
+}
+
+/* 32/3 < they < 32/2 */
+#define shiftl	8
+#define shiftr	12
+
+#define HASH_FROM_TO(from, to) 			\
+for (index = from; index < to; index++) {	\
+	pos = random_nums[index];		\
+	hash += key[pos];			\
+	hash += (hash << shiftl);		\
+	hash ^= (hash >> shiftr);		\
+}
+
+
+#define HASH_FROM_DOWN_TO(from, to) 		\
+for (index = from - 1; index >= to; index--) {	\
+	hash ^= (hash >> shiftr);		\
+	hash ^= (hash >> (shiftr*2));		\
+	hash -= (hash << shiftl);		\
+	hash += (hash << (shiftl*2));		\
+	pos = random_nums[index];		\
+	hash -= key[pos];			\
+}
+
+/*
+ * The main random sample hash function.
+ */
+static u32 random_sample_hash(void *addr, u32 hash_strength)
+{
+	u32 hash = 0xdeadbeef;
+	int index, pos, loop = hash_strength;
+	u32 *key = (u32 *)addr;
+
+	if (loop > HASH_STRENGTH_FULL)
+		loop = HASH_STRENGTH_FULL;
+
+	HASH_FROM_TO(0, loop);
+
+	if (hash_strength > HASH_STRENGTH_FULL) {
+		loop = hash_strength - HASH_STRENGTH_FULL;
+		HASH_FROM_TO(0, loop);
+	}
+
+	return hash;
+}
+
+
+/**
+ * It's used when the hash strength is adjusted.
+ *
+ * @addr	The page's virtual address
+ * @from	The original hash strength
+ * @to		The hash strength changed to
+ * @hash	The hash value generated at the "from" hash strength
+ *
+ * return the hash value
+ */
+static u32 delta_hash(void *addr, int from, int to, u32 hash)
+{
+	u32 *key = (u32 *)addr;
+	int index, pos; /* make sure they are int type */
+
+	if (to > from) {
+		if (from >= HASH_STRENGTH_FULL) {
+			from -= HASH_STRENGTH_FULL;
+			to -= HASH_STRENGTH_FULL;
+			HASH_FROM_TO(from, to);
+		} else if (to <= HASH_STRENGTH_FULL) {
+			HASH_FROM_TO(from, to);
+		} else {
+			HASH_FROM_TO(from, HASH_STRENGTH_FULL);
+			HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
+		}
+	} else {
+		if (from <= HASH_STRENGTH_FULL) {
+			HASH_FROM_DOWN_TO(from, to);
+		} else if (to >= HASH_STRENGTH_FULL) {
+			from -= HASH_STRENGTH_FULL;
+			to -= HASH_STRENGTH_FULL;
+			HASH_FROM_DOWN_TO(from, to);
+		} else {
+			HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
+			HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
+		}
+	}
+
+	return hash;
+}
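The macro pair above is exactly invertible on 32-bit values: a forward step multiplies by 257 and applies an xor-shift, and HASH_FROM_DOWN_TO undoes both (257 * -255 * 65537 == 1 mod 2^32). Consequently, delta_hash() from strength s1 to s2 must agree with hashing at s2 from scratch, as long as both stay within HASH_STRENGTH_FULL. A standalone self-check with the macro bodies copied verbatim; the harness around them (sample_hash, main, the rand() fill) is ours:

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096
    #define HASH_STRENGTH_FULL (PAGE_SIZE / 4)
    #define shiftl 8
    #define shiftr 12

    static uint32_t random_nums[HASH_STRENGTH_FULL];

    #define HASH_FROM_TO(from, to)                      \
    for (index = from; index < to; index++) {           \
            pos = random_nums[index];                   \
            hash += key[pos];                           \
            hash += (hash << shiftl);                   \
            hash ^= (hash >> shiftr);                   \
    }

    #define HASH_FROM_DOWN_TO(from, to)                 \
    for (index = from - 1; index >= to; index--) {      \
            hash ^= (hash >> shiftr);                   \
            hash ^= (hash >> (shiftr * 2));             \
            hash -= (hash << shiftl);                   \
            hash += (hash << (shiftl * 2));             \
            pos = random_nums[index];                   \
            hash -= key[pos];                           \
    }

    static uint32_t sample_hash(const uint32_t *key, int strength)
    {
            uint32_t hash = 0xdeadbeef;
            int index, pos;

            HASH_FROM_TO(0, strength);
            return hash;
    }

    int main(void)
    {
            static uint32_t page[HASH_STRENGTH_FULL];
            const uint32_t *key = page;
            uint32_t hash;
            int i, index, pos;

            for (i = 0; i < HASH_STRENGTH_FULL; i++) {
                    random_nums[i] = rand() % HASH_STRENGTH_FULL;
                    page[i] = rand();
            }

            hash = sample_hash(page, 64);           /* strength 64 */
            HASH_FROM_TO(64, 512);                  /* re-hash up to 512 */
            assert(hash == sample_hash(page, 512));

            HASH_FROM_DOWN_TO(512, 64);             /* and back down */
            assert(hash == sample_hash(page, 64));
            return 0;
    }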
+
+/**
+ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
+ * has finished.
+ *
+ * return 0 if no page has been scanned since the last call, 1 otherwise.
+ */
+static inline int encode_benefit(void)
+{
+	u64 scanned_delta, pos_delta, neg_delta;
+	unsigned long base = benefit.base;
+
+	scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
+
+	if (!scanned_delta)
+		return 0;
+
+	scanned_delta >>= base;
+	pos_delta = rshash_pos >> base;
+	neg_delta = rshash_neg >> base;
+
+	if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
+	    CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
+	    CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
+		benefit.scanned >>= 1;
+		benefit.neg >>= 1;
+		benefit.pos >>= 1;
+		benefit.base++;
+		scanned_delta >>= 1;
+		pos_delta >>= 1;
+		neg_delta >>= 1;
+	}
+
+	benefit.pos += pos_delta;
+	benefit.neg += neg_delta;
+	benefit.scanned += scanned_delta;
+
+	BUG_ON(!benefit.scanned);
+
+	rshash_pos = rshash_neg = 0;
+	uksm_pages_scanned_last = uksm_pages_scanned;
+
+	return 1;
+}
+
+static inline void reset_benefit(void)
+{
+	benefit.pos = 0;
+	benefit.neg = 0;
+	benefit.base = 0;
+	benefit.scanned = 0;
+}
+
+static inline void inc_rshash_pos(unsigned long delta)
+{
+	if (CAN_OVERFLOW_U64(rshash_pos, delta))
+		encode_benefit();
+
+	rshash_pos += delta;
+}
+
+static inline void inc_rshash_neg(unsigned long delta)
+{
+	if (CAN_OVERFLOW_U64(rshash_neg, delta))
+		encode_benefit();
+
+	rshash_neg += delta;
+}
+
+
+static inline u32 page_hash(struct page *page, unsigned long hash_strength,
+			    int cost_accounting)
+{
+	u32 val;
+	unsigned long delta;
+
+	void *addr = kmap_atomic(page);
+
+	val = random_sample_hash(addr, hash_strength);
+	kunmap_atomic(addr);
+
+	if (cost_accounting) {
+		if (HASH_STRENGTH_FULL > hash_strength)
+			delta = HASH_STRENGTH_FULL - hash_strength;
+		else
+			delta = 0;
+
+		inc_rshash_pos(delta);
+	}
+
+	return val;
+}
+
+static int memcmp_pages(struct page *page1, struct page *page2,
+			int cost_accounting)
+{
+	char *addr1, *addr2;
+	int ret;
+
+	addr1 = kmap_atomic(page1);
+	addr2 = kmap_atomic(page2);
+	ret = memcmp(addr1, addr2, PAGE_SIZE);
+	kunmap_atomic(addr2);
+	kunmap_atomic(addr1);
+
+	if (cost_accounting)
+		inc_rshash_neg(memcmp_cost);
+
+	return ret;
+}
+
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+	return !memcmp_pages(page1, page2, 0);
+}
+
+static inline int is_page_full_zero(struct page *page)
+{
+	char *addr;
+	int ret;
+
+	addr = kmap_atomic(page);
+	ret = is_full_zero(addr, PAGE_SIZE);
+	kunmap_atomic(addr);
+
+	return ret;
+}
+
+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
+			      pte_t *orig_pte, pte_t *old_pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr;
+	pte_t *ptep;
+	spinlock_t *ptl;
+	int swapped;
+	int err = -EFAULT;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	addr = page_address_in_vma(page, vma);
+	if (addr == -EFAULT)
+		goto out;
+
+	BUG_ON(PageTransCompound(page));
+
+	mmun_start = addr;
+	mmun_end = addr + PAGE_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	ptep = page_check_address(page, mm, addr, &ptl, 0);
+	if (!ptep)
+		goto out_mn;
+
+	if (old_pte)
+		*old_pte = *ptep;
+
+	if (pte_write(*ptep) || pte_dirty(*ptep)) {
+		pte_t entry;
+
+		swapped = PageSwapCache(page);
+		flush_cache_page(vma, addr, page_to_pfn(page));
+		/*
+		 * Ok this is tricky: when get_user_pages_fast() is run it doesn't
+		 * take any lock, therefore the check that we are going to make
+		 * with the pagecount against the mapcount is racy and
+		 * O_DIRECT can happen right after the check.
+		 * So we clear the pte and flush the tlb before the check;
+		 * this assures us that no O_DIRECT can happen after the check
+		 * or in the middle of the check.
+		 */
+		entry = ptep_clear_flush_notify(vma, addr, ptep);
+		/*
+		 * Check that no O_DIRECT or similar I/O is in progress on the
+		 * page
+		 */
+		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
+			set_pte_at(mm, addr, ptep, entry);
+			goto out_unlock;
+		}
+		if (pte_dirty(entry))
+			set_page_dirty(page);
+		entry = pte_mkclean(pte_wrprotect(entry));
+		set_pte_at_notify(mm, addr, ptep, entry);
+	}
+	*orig_pte = *ptep;
+	err = 0;
+
+out_unlock:
+	pte_unmap_unlock(ptep, ptl);
+out_mn:
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+	return err;
+}
+
+#define MERGE_ERR_PGERR		1 /* the page is invalid, cannot continue */
+#define MERGE_ERR_COLLI		2 /* there is a collision */
+#define MERGE_ERR_COLLI_MAX	3 /* collision at the max hash strength */
+#define MERGE_ERR_CHANGED	4 /* the page has changed since last hash */
+
+
+/**
+ * replace_page - replace page in vma by new ksm page
+ * @vma:      vma that holds the pte pointing to page
+ * @page:     the page we are replacing by kpage
+ * @kpage:    the ksm page we replace page by
+ * @orig_pte: the original value of the pte
+ *
+ * Returns 0 on success, MERGE_ERR_PGERR on failure.
+ */
+static int replace_page(struct vm_area_struct *vma, struct page *page,
+			struct page *kpage, pte_t orig_pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep;
+	spinlock_t *ptl;
+	pte_t entry;
+
+	unsigned long addr;
+	int err = MERGE_ERR_PGERR;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	addr = page_address_in_vma(page, vma);
+	if (addr == -EFAULT)
+		goto out;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, addr);
+	BUG_ON(pmd_trans_huge(*pmd));
+	if (!pmd_present(*pmd))
+		goto out;
+
+	mmun_start = addr;
+	mmun_end = addr + PAGE_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!pte_same(*ptep, orig_pte)) {
+		pte_unmap_unlock(ptep, ptl);
+		goto out_mn;
+	}
+
+	flush_cache_page(vma, addr, pte_pfn(*ptep));
+	ptep_clear_flush_notify(vma, addr, ptep);
+	entry = mk_pte(kpage, vma->vm_page_prot);
+
+	/* special treatment is needed for zero_page */
+	if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
+	    (page_to_pfn(kpage) == zero_pfn))
+		entry = pte_mkspecial(entry);
+	else {
+		get_page(kpage);
+		page_add_anon_rmap(kpage, vma, addr);
+	}
+
+	set_pte_at_notify(mm, addr, ptep, entry);
+
+	page_remove_rmap(page);
+	if (!page_mapped(page))
+		try_to_free_swap(page);
+	put_page(page);
+
+	pte_unmap_unlock(ptep, ptl);
+	err = 0;
+out_mn:
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+	return err;
+}
+
+
+/**
+ * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
+ * The zero hash value at HASH_STRENGTH_MAX is used to indicate that its
+ * hash_max member has not been calculated.
+ * + * @page The page needs to be hashed + * @hash_old The hash value calculated with current hash strength + * + * return the new hash value calculated at HASH_STRENGTH_MAX + */ +static inline u32 page_hash_max(struct page *page, u32 hash_old) +{ + u32 hash_max = 0; + void *addr; + + addr = kmap_atomic(page); + hash_max = delta_hash(addr, hash_strength, + HASH_STRENGTH_MAX, hash_old); + + kunmap_atomic(addr); + + if (!hash_max) + hash_max = 1; + + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); + return hash_max; +} + +/* + * We compare the hash again, to ensure that it is really a hash collision + * instead of being caused by page write. + */ +static inline int check_collision(struct rmap_item *rmap_item, + u32 hash) +{ + int err; + struct page *page = rmap_item->page; + + /* if this rmap_item has already been hash_maxed, then the collision + * must appears in the second-level rbtree search. In this case we check + * if its hash_max value has been changed. Otherwise, the collision + * happens in the first-level rbtree search, so we check against it's + * current hash value. + */ + if (rmap_item->hash_max) { + inc_rshash_neg(memcmp_cost); + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); + + if (rmap_item->hash_max == page_hash_max(page, hash)) + err = MERGE_ERR_COLLI; + else + err = MERGE_ERR_CHANGED; + } else { + inc_rshash_neg(memcmp_cost + hash_strength); + + if (page_hash(page, hash_strength, 0) == hash) + err = MERGE_ERR_COLLI; + else + err = MERGE_ERR_CHANGED; + } + + return err; +} + +static struct page *page_trans_compound_anon(struct page *page) +{ + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + /* + * head may actually be splitted and freed from under + * us but it's ok here. + */ + if (PageAnon(head)) + return head; + } + return NULL; +} + +static int page_trans_compound_anon_split(struct page *page) +{ + int ret = 0; + struct page *transhuge_head = page_trans_compound_anon(page); + if (transhuge_head) { + /* Get the reference on the head to split it. */ + if (get_page_unless_zero(transhuge_head)) { + /* + * Recheck we got the reference while the head + * was still anonymous. + */ + if (PageAnon(transhuge_head)) + ret = split_huge_page(transhuge_head); + else + /* + * Retry later if split_huge_page run + * from under us. + */ + ret = 1; + put_page(transhuge_head); + } else + /* Retry later if split_huge_page run from under us. */ + ret = 1; + } + return ret; +} + +/** + * Try to merge a rmap_item.page with a kpage in stable node. kpage must + * already be a ksm page. + * + * @return 0 if the pages were merged, -EFAULT otherwise. + */ +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, + struct page *kpage, u32 hash) +{ + struct vm_area_struct *vma = rmap_item->slot->vma; + struct mm_struct *mm = vma->vm_mm; + pte_t orig_pte = __pte(0); + int err = MERGE_ERR_PGERR; + struct page *page; + + if (uksm_test_exit(mm)) + goto out; + + page = rmap_item->page; + + if (page == kpage) { /* ksm page forked */ + err = 0; + goto out; + } + + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); + + if (!PageAnon(page) || !PageKsm(kpage)) + goto out; + + /* + * We need the page lock to read a stable PageSwapCache in + * write_protect_page(). We use trylock_page() instead of + * lock_page() because we don't want to wait here - we + * prefer to continue scanning and merging different pages, + * then come back to this page when it is unlocked. 
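+	 *
+	 * (trylock_page() is the non-sleeping variant of lock_page(): it
+	 * returns true only when the lock is acquired immediately, so a
+	 * busy page costs nothing more than a skipped iteration this
+	 * round.)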
+ */ + if (!trylock_page(page)) + goto out; + /* + * If this anonymous page is mapped only here, its pte may need + * to be write-protected. If it's mapped elsewhere, all of its + * ptes are necessarily already write-protected. But in either + * case, we need to lock and check page_count is not raised. + */ + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { + if (pages_identical(page, kpage)) + err = replace_page(vma, page, kpage, orig_pte); + else + err = check_collision(rmap_item, hash); + } + + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { + munlock_vma_page(page); + if (!PageMlocked(kpage)) { + unlock_page(page); + lock_page(kpage); + mlock_vma_page(kpage); + page = kpage; /* for final unlock */ + } + } + + unlock_page(page); +out: + return err; +} + + + +/** + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance + * to restore a page mapping that has been changed in try_to_merge_two_pages. + * + * @return 0 on success. + */ +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t orig_pte, pte_t wprt_pte) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + + int err = -EFAULT; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_same(*ptep, wprt_pte)) { + /* already copied, let it be */ + pte_unmap_unlock(ptep, ptl); + goto out; + } + + /* + * Good boy, still here. When we still get the ksm page, it does not + * return to the free page pool, there is no way that a pte was changed + * to other page and gets back to this page. And remind that ksm page + * do not reuse in do_wp_page(). So it's safe to restore the original + * pte. + */ + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, orig_pte); + + pte_unmap_unlock(ptep, ptl); + err = 0; +out: + return err; +} + +/** + * try_to_merge_two_pages() - take two identical pages and prepare + * them to be merged into one page(rmap_item->page) + * + * @return 0 if we successfully merged two identical pages into + * one ksm page. MERGE_ERR_COLLI if it's only a hash collision + * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been + * changed since it's hashed. MERGE_ERR_PGERR otherwise. 
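+ *
+ * A sketch of how a caller typically dispatches on these codes
+ * (cmp_and_merge_page() below follows this shape):
+ *
+ *	err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
+ *	if (!err)
+ *		insert the merged page into the stable tree;
+ *	else if (err == MERGE_ERR_COLLI)
+ *		keep both items in the unstable subtree, keyed by hash_max;
+ *	else
+ *		drop the tree item and continue scanning;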
+ * + */ +static int try_to_merge_two_pages(struct rmap_item *rmap_item, + struct rmap_item *tree_rmap_item, + u32 hash) +{ + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); + struct vm_area_struct *vma1 = rmap_item->slot->vma; + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; + struct page *page = rmap_item->page; + struct page *tree_page = tree_rmap_item->page; + int err = MERGE_ERR_PGERR; + struct address_space *saved_mapping; + + + if (rmap_item->page == tree_rmap_item->page) + goto out; + + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); + + if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page)) + goto out; + BUG_ON(PageTransCompound(tree_page)); + + if (!PageAnon(page) || !PageAnon(tree_page)) + goto out; + + if (!trylock_page(page)) + goto out; + + + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { + unlock_page(page); + goto out; + } + + /* + * While we hold page lock, upgrade page from + * PageAnon+anon_vma to PageKsm+NULL stable_node: + * stable_tree_insert() will update stable_node. + */ + saved_mapping = page->mapping; + set_page_stable_node(page, NULL); + mark_page_accessed(page); + unlock_page(page); + + if (!trylock_page(tree_page)) + goto restore_out; + + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { + unlock_page(tree_page); + goto restore_out; + } + + if (pages_identical(page, tree_page)) { + err = replace_page(vma2, tree_page, page, wprt_pte2); + if (err) { + unlock_page(tree_page); + goto restore_out; + } + + if ((vma2->vm_flags & VM_LOCKED)) { + munlock_vma_page(tree_page); + if (!PageMlocked(page)) { + unlock_page(tree_page); + lock_page(page); + mlock_vma_page(page); + tree_page = page; /* for final unlock */ + } + } + + unlock_page(tree_page); + + goto out; /* success */ + + } else { + if (tree_rmap_item->hash_max && + tree_rmap_item->hash_max == rmap_item->hash_max) { + err = MERGE_ERR_COLLI_MAX; + } else if (page_hash(page, hash_strength, 0) == + page_hash(tree_page, hash_strength, 0)) { + inc_rshash_neg(memcmp_cost + hash_strength * 2); + err = MERGE_ERR_COLLI; + } else { + err = MERGE_ERR_CHANGED; + } + + unlock_page(tree_page); + } + +restore_out: + lock_page(page); + if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), + orig_pte1, wprt_pte1)) + page->mapping = saved_mapping; + + unlock_page(page); +out: + return err; +} + +static inline int hash_cmp(u32 new_val, u32 node_val) +{ + if (new_val > node_val) + return 1; + else if (new_val < node_val) + return -1; + else + return 0; +} + +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) +{ + u32 hash_max = item->hash_max; + + if (!hash_max) { + hash_max = page_hash_max(item->page, hash); + + item->hash_max = hash_max; + } + + return hash_max; +} + + + +/** + * stable_tree_search() - search the stable tree for a page + * + * @item: the rmap_item we are comparing with + * @hash: the hash value of this item->page already calculated + * + * @return the page we have found, NULL otherwise. The page returned has + * been gotten. 
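+ *
+ * "Gotten" means a reference is held on the returned page; the caller
+ * is expected to drop it when done, e.g.:
+ *
+ *	page = stable_tree_search(item, hash);
+ *	if (page) {
+ *		... use the page ...
+ *		put_page(page);
+ *	}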
+ */ +static struct page *stable_tree_search(struct rmap_item *item, u32 hash) +{ + struct rb_node *node = root_stable_treep->rb_node; + struct tree_node *tree_node; + unsigned long hash_max; + struct page *page = item->page; + struct stable_node *stable_node; + + stable_node = page_stable_node(page); + if (stable_node) { + /* ksm page forked, that is + * if (PageKsm(page) && !in_stable_tree(rmap_item)) + * it's actually gotten once outside. + */ + get_page(page); + return page; + } + + while (node) { + int cmp; + + tree_node = rb_entry(node, struct tree_node, node); + + cmp = hash_cmp(hash, tree_node->hash); + + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else + break; + } + + if (!node) + return NULL; + + if (tree_node->count == 1) { + stable_node = rb_entry(tree_node->sub_root.rb_node, + struct stable_node, node); + BUG_ON(!stable_node); + + goto get_page_out; + } + + /* + * ok, we have to search the second + * level subtree, hash the page to a + * full strength. + */ + node = tree_node->sub_root.rb_node; + BUG_ON(!node); + hash_max = rmap_item_hash_max(item, hash); + + while (node) { + int cmp; + + stable_node = rb_entry(node, struct stable_node, node); + + cmp = hash_cmp(hash_max, stable_node->hash_max); + + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else + goto get_page_out; + } + + return NULL; + +get_page_out: + page = get_uksm_page(stable_node, 1, 1); + return page; +} + +static int try_merge_rmap_item(struct rmap_item *item, + struct page *kpage, + struct page *tree_page) +{ + spinlock_t *ptl; + pte_t *ptep; + unsigned long addr; + struct vm_area_struct *vma = item->slot->vma; + + addr = get_rmap_addr(item); + ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0); + if (!ptep) + return 0; + + if (pte_write(*ptep)) { + /* has changed, abort! */ + pte_unmap_unlock(ptep, ptl); + return 0; + } + + get_page(tree_page); + page_add_anon_rmap(tree_page, vma, addr); + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(vma->vm_mm, addr, ptep, + mk_pte(tree_page, vma->vm_page_prot)); + + page_remove_rmap(kpage); + put_page(kpage); + + pte_unmap_unlock(ptep, ptl); + + return 1; +} + +/** + * try_to_merge_with_stable_page() - when two rmap_items need to be inserted + * into stable tree, the page was found to be identical to a stable ksm page, + * this is the last chance we can merge them into one. + * + * @item1: the rmap_item holding the page which we wanted to insert + * into stable tree. 
+ * @item2: the other rmap_item found during the unstable tree search
+ * @kpage: points to the page currently mapped by the two rmap_items
+ * @tree_page: the page we found identical in stable tree node
+ * @success1: set on return if item1 is successfully merged
+ * @success2: set on return if item2 is successfully merged
+ */
+static void try_merge_with_stable(struct rmap_item *item1,
+				  struct rmap_item *item2,
+				  struct page **kpage,
+				  struct page *tree_page,
+				  int *success1, int *success2)
+{
+	struct vm_area_struct *vma1 = item1->slot->vma;
+	struct vm_area_struct *vma2 = item2->slot->vma;
+	*success1 = 0;
+	*success2 = 0;
+
+	if (unlikely(*kpage == tree_page)) {
+		/* I don't think this can really happen */
+		printk(KERN_WARNING "UKSM: unexpected condition detected in "
+			"try_merge_with_stable() -- *kpage == tree_page !\n");
+		*success1 = 1;
+		*success2 = 1;
+		return;
+	}
+
+	if (!PageAnon(*kpage) || !PageKsm(*kpage))
+		goto failed;
+
+	if (!trylock_page(tree_page))
+		goto failed;
+
+	/* If the old page is still ksm, still pointed
+	 * to in the right place, and still write protected,
+	 * we are confident it has not changed; no need to
+	 * memcmp anymore.
+	 * Beware: we cannot take nested pte locks,
+	 * deadlock risk.
+	 */
+	if (!try_merge_rmap_item(item1, *kpage, tree_page))
+		goto unlock_failed;
+
+	/* OK, now vma2; remember that pte1 is already set */
+	if (!try_merge_rmap_item(item2, *kpage, tree_page))
+		goto success_1;
+
+	*success2 = 1;
+success_1:
+	*success1 = 1;
+
+
+	if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
+	    (*success2 && vma2->vm_flags & VM_LOCKED)) {
+		munlock_vma_page(*kpage);
+		if (!PageMlocked(tree_page))
+			mlock_vma_page(tree_page);
+	}
+
+	/*
+	 * We do not need the old page any more in the caller, so we can
+	 * break the lock now.
+	 */
+	unlock_page(*kpage);
+	*kpage = tree_page; /* Get unlocked outside. */
+	return;
+
+unlock_failed:
+	unlock_page(tree_page);
+failed:
+	return;
+}
+
+static inline void stable_node_hash_max(struct stable_node *node,
+					 struct page *page, u32 hash)
+{
+	u32 hash_max = node->hash_max;
+
+	if (!hash_max) {
+		hash_max = page_hash_max(page, hash);
+		node->hash_max = hash_max;
+	}
+}
+
+static inline
+struct stable_node *new_stable_node(struct tree_node *tree_node,
+				    struct page *kpage, u32 hash_max)
+{
+	struct stable_node *new_stable_node;
+
+	new_stable_node = alloc_stable_node();
+	if (!new_stable_node)
+		return NULL;
+
+	new_stable_node->kpfn = page_to_pfn(kpage);
+	new_stable_node->hash_max = hash_max;
+	new_stable_node->tree_node = tree_node;
+	set_page_stable_node(kpage, new_stable_node);
+
+	return new_stable_node;
+}
+
+static inline
+struct stable_node *first_level_insert(struct tree_node *tree_node,
+				       struct rmap_item *rmap_item,
+				       struct rmap_item *tree_rmap_item,
+				       struct page **kpage, u32 hash,
+				       int *success1, int *success2)
+{
+	int cmp;
+	struct page *tree_page;
+	u32 hash_max = 0;
+	struct stable_node *stable_node, *new_snode;
+	struct rb_node *parent = NULL, **new;
+
+	/* this tree node contains no sub-tree yet */
+	stable_node = rb_entry(tree_node->sub_root.rb_node,
+			       struct stable_node, node);
+
+	tree_page = get_uksm_page(stable_node, 1, 0);
+	if (tree_page) {
+		cmp = memcmp_pages(*kpage, tree_page, 1);
+		if (!cmp) {
+			try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
+					      tree_page, success1, success2);
+			put_page(tree_page);
+			if (!*success1 && !*success2)
+				goto failed;
+
+			return stable_node;
+
+		} else {
+			/*
+			 * Collision at the first level; try to create a
+			 * subtree. A new node needs to be created.
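+			 * The promotion works like this (sketch): both the
+			 * existing stable_node and the new one are keyed by
+			 * their full-strength hash_max and linked as an
+			 * rbtree under the shared tree_node:
+			 *
+			 *	tree_node(hash)        tree_node(hash)
+			 *	      |        --->      /         \
+			 *	stable_node      old(hash_max A) new(hash_max B)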
+ */ + put_page(tree_page); + + stable_node_hash_max(stable_node, tree_page, + tree_node->hash); + hash_max = rmap_item_hash_max(rmap_item, hash); + cmp = hash_cmp(hash_max, stable_node->hash_max); + + parent = &stable_node->node; + if (cmp < 0) { + new = &parent->rb_left; + } else if (cmp > 0) { + new = &parent->rb_right; + } else { + goto failed; + } + } + + } else { + /* the only stable_node deleted, we reuse its tree_node. + */ + parent = NULL; + new = &tree_node->sub_root.rb_node; + } + + new_snode = new_stable_node(tree_node, *kpage, hash_max); + if (!new_snode) + goto failed; + + rb_link_node(&new_snode->node, parent, new); + rb_insert_color(&new_snode->node, &tree_node->sub_root); + tree_node->count++; + *success1 = *success2 = 1; + + return new_snode; + +failed: + return NULL; +} + +static inline +struct stable_node *stable_subtree_insert(struct tree_node *tree_node, + struct rmap_item *rmap_item, + struct rmap_item *tree_rmap_item, + struct page **kpage, u32 hash, + int *success1, int *success2) +{ + struct page *tree_page; + u32 hash_max; + struct stable_node *stable_node, *new_snode; + struct rb_node *parent, **new; + +research: + parent = NULL; + new = &tree_node->sub_root.rb_node; + BUG_ON(!*new); + hash_max = rmap_item_hash_max(rmap_item, hash); + while (*new) { + int cmp; + + stable_node = rb_entry(*new, struct stable_node, node); + + cmp = hash_cmp(hash_max, stable_node->hash_max); + + if (cmp < 0) { + parent = *new; + new = &parent->rb_left; + } else if (cmp > 0) { + parent = *new; + new = &parent->rb_right; + } else { + tree_page = get_uksm_page(stable_node, 1, 0); + if (tree_page) { + cmp = memcmp_pages(*kpage, tree_page, 1); + if (!cmp) { + try_merge_with_stable(rmap_item, + tree_rmap_item, kpage, + tree_page, success1, success2); + + put_page(tree_page); + if (!*success1 && !*success2) + goto failed; + /* + * successfully merged with a stable + * node + */ + return stable_node; + } else { + put_page(tree_page); + goto failed; + } + } else { + /* + * stable node may be deleted, + * and subtree maybe + * restructed, cannot + * continue, research it. + */ + if (tree_node->count) { + goto research; + } else { + /* reuse the tree node*/ + parent = NULL; + new = &tree_node->sub_root.rb_node; + } + } + } + } + + new_snode = new_stable_node(tree_node, *kpage, hash_max); + if (!new_snode) + goto failed; + + rb_link_node(&new_snode->node, parent, new); + rb_insert_color(&new_snode->node, &tree_node->sub_root); + tree_node->count++; + *success1 = *success2 = 1; + + return new_snode; + +failed: + return NULL; +} + + +/** + * stable_tree_insert() - try to insert a merged page in unstable tree to + * the stable tree + * + * @kpage: the page need to be inserted + * @hash: the current hash of this page + * @rmap_item: the rmap_item being scanned + * @tree_rmap_item: the rmap_item found on unstable tree + * @success1: return if rmap_item is merged + * @success2: return if tree_rmap_item is merged + * + * @return the stable_node on stable tree if at least one + * rmap_item is inserted into stable tree, NULL + * otherwise. 
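+ *
+ * The two success flags are independent; the caller undoes the write
+ * protection of whichever item did not make it, e.g. (sketch of the
+ * caller in cmp_and_merge_page() below):
+ *
+ *	if (!success1)
+ *		break_cow(rmap_item);
+ *	if (!success2)
+ *		break_cow(tree_rmap_item);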
+ */ +static struct stable_node * +stable_tree_insert(struct page **kpage, u32 hash, + struct rmap_item *rmap_item, + struct rmap_item *tree_rmap_item, + int *success1, int *success2) +{ + struct rb_node **new = &root_stable_treep->rb_node; + struct rb_node *parent = NULL; + struct stable_node *stable_node; + struct tree_node *tree_node; + u32 hash_max = 0; + + *success1 = *success2 = 0; + + while (*new) { + int cmp; + + tree_node = rb_entry(*new, struct tree_node, node); + + cmp = hash_cmp(hash, tree_node->hash); + + if (cmp < 0) { + parent = *new; + new = &parent->rb_left; + } else if (cmp > 0) { + parent = *new; + new = &parent->rb_right; + } else + break; + } + + if (*new) { + if (tree_node->count == 1) { + stable_node = first_level_insert(tree_node, rmap_item, + tree_rmap_item, kpage, + hash, success1, success2); + } else { + stable_node = stable_subtree_insert(tree_node, + rmap_item, tree_rmap_item, kpage, + hash, success1, success2); + } + } else { + + /* no tree node found */ + tree_node = alloc_tree_node(stable_tree_node_listp); + if (!tree_node) { + stable_node = NULL; + goto out; + } + + stable_node = new_stable_node(tree_node, *kpage, hash_max); + if (!stable_node) { + free_tree_node(tree_node); + goto out; + } + + tree_node->hash = hash; + rb_link_node(&tree_node->node, parent, new); + rb_insert_color(&tree_node->node, root_stable_treep); + parent = NULL; + new = &tree_node->sub_root.rb_node; + + rb_link_node(&stable_node->node, parent, new); + rb_insert_color(&stable_node->node, &tree_node->sub_root); + tree_node->count++; + *success1 = *success2 = 1; + } + +out: + return stable_node; +} + + +/** + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem + * + * @return 0 on success, -EBUSY if unable to lock the mmap_sem, + * -EINVAL if the page mapping has been changed. + */ +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) +{ + int err; + + err = get_mergeable_page_lock_mmap(tree_rmap_item); + + if (err == -EINVAL) { + /* its page map has been changed, remove it */ + remove_rmap_item_from_tree(tree_rmap_item); + } + + /* The page is gotten and mmap_sem is locked now. */ + return err; +} + + +/** + * unstable_tree_search_insert() - search an unstable tree rmap_item with the + * same hash value. 
Get its page and trylock the mmap_sem + */ +static inline +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, + u32 hash) + +{ + struct rb_node **new = &root_unstable_tree.rb_node; + struct rb_node *parent = NULL; + struct tree_node *tree_node; + u32 hash_max; + struct rmap_item *tree_rmap_item; + + while (*new) { + int cmp; + + tree_node = rb_entry(*new, struct tree_node, node); + + cmp = hash_cmp(hash, tree_node->hash); + + if (cmp < 0) { + parent = *new; + new = &parent->rb_left; + } else if (cmp > 0) { + parent = *new; + new = &parent->rb_right; + } else + break; + } + + if (*new) { + /* got the tree_node */ + if (tree_node->count == 1) { + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, + struct rmap_item, node); + BUG_ON(!tree_rmap_item); + + goto get_page_out; + } + + /* well, search the collision subtree */ + new = &tree_node->sub_root.rb_node; + BUG_ON(!*new); + hash_max = rmap_item_hash_max(rmap_item, hash); + + while (*new) { + int cmp; + + tree_rmap_item = rb_entry(*new, struct rmap_item, + node); + + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); + parent = *new; + if (cmp < 0) + new = &parent->rb_left; + else if (cmp > 0) + new = &parent->rb_right; + else + goto get_page_out; + } + } else { + /* alloc a new tree_node */ + tree_node = alloc_tree_node(&unstable_tree_node_list); + if (!tree_node) + return NULL; + + tree_node->hash = hash; + rb_link_node(&tree_node->node, parent, new); + rb_insert_color(&tree_node->node, &root_unstable_tree); + parent = NULL; + new = &tree_node->sub_root.rb_node; + } + + /* did not found even in sub-tree */ + rmap_item->tree_node = tree_node; + rmap_item->address |= UNSTABLE_FLAG; + rmap_item->hash_round = uksm_hash_round; + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, &tree_node->sub_root); + + uksm_pages_unshared++; + return NULL; + +get_page_out: + if (tree_rmap_item->page == rmap_item->page) + return NULL; + + if (get_tree_rmap_item_page(tree_rmap_item)) + return NULL; + + return tree_rmap_item; +} + +static void hold_anon_vma(struct rmap_item *rmap_item, + struct anon_vma *anon_vma) +{ + rmap_item->anon_vma = anon_vma; + get_anon_vma(anon_vma); +} + + +/** + * stable_tree_append() - append a rmap_item to a stable node. Deduplication + * ratio statistics is done in this function. 
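+ *
+ * Every other slot already mapping this stable node is credited with
+ * pages_bemerged += rung->step (the sampling factor); that count later
+ * drives rung promotion, roughly
+ *
+ *	ratio = pages_bemerged * 100 / pages_scanned
+ *
+ * (see cal_dedup_ratio_old() and round_update_ladder() below).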
+ * + */ +static void stable_tree_append(struct rmap_item *rmap_item, + struct stable_node *stable_node, int logdedup) +{ + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; + unsigned long key = (unsigned long)rmap_item->slot; + unsigned long factor = rmap_item->slot->rung->step; + + BUG_ON(!stable_node); + rmap_item->address |= STABLE_FLAG; + + if (hlist_empty(&stable_node->hlist)) { + uksm_pages_shared++; + goto node_vma_new; + } else { + uksm_pages_sharing++; + } + + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { + if (node_vma->key >= key) + break; + + if (logdedup) { + node_vma->slot->pages_bemerged += factor; + if (list_empty(&node_vma->slot->dedup_list)) + list_add(&node_vma->slot->dedup_list, + &vma_slot_dedup); + } + } + + if (node_vma) { + if (node_vma->key == key) { + node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); + goto node_vma_ok; + } else if (node_vma->key > key) { + node_vma_cont = node_vma; + } + } + +node_vma_new: + /* no same vma already in node, alloc a new node_vma */ + new_node_vma = alloc_node_vma(); + BUG_ON(!new_node_vma); + new_node_vma->head = stable_node; + new_node_vma->slot = rmap_item->slot; + + if (!node_vma) { + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); + } else if (node_vma->key != key) { + if (node_vma->key < key) + hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); + else { + hlist_add_before(&new_node_vma->hlist, + &node_vma->hlist); + } + + } + node_vma = new_node_vma; + +node_vma_ok: /* ok, ready to add to the list */ + rmap_item->head = node_vma; + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); + if (logdedup) { + rmap_item->slot->pages_merged++; + if (node_vma_cont) { + node_vma = node_vma_cont; + hlist_for_each_entry_continue(node_vma, hlist) { + node_vma->slot->pages_bemerged += factor; + if (list_empty(&node_vma->slot->dedup_list)) + list_add(&node_vma->slot->dedup_list, + &vma_slot_dedup); + } + } + } +} + +/* + * We use break_ksm to break COW on a ksm page: it's a stripped down + * + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) + * put_page(page); + * + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * in case the application has unmapped and remapped mm,addr meanwhile. + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + */ +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +{ + struct page *page; + int ret = 0; + + do { + cond_resched(); + page = follow_page(vma, addr, FOLL_GET); + if (IS_ERR_OR_NULL(page)) + break; + if (PageKsm(page)) { + ret = handle_mm_fault(vma->vm_mm, vma, addr, + FAULT_FLAG_WRITE); + } else + ret = VM_FAULT_WRITE; + put_page(page); + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); + /* + * We must loop because handle_mm_fault() may back out if there's + * any difficulty e.g. if pte accessed bit gets updated concurrently. + * + * VM_FAULT_WRITE is what we have been hoping for: it indicates that + * COW has been broken, even if the vma does not permit VM_WRITE; + * but note that a concurrent fault might break PageKsm for us. + * + * VM_FAULT_SIGBUS could occur if we race with truncation of the + * backing file, which also invalidates anonymous pages: that's + * okay, that truncation will have unmapped the PageKsm for us. 
+ * + * VM_FAULT_OOM: at the time of writing (late July 2009), setting + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the + * current task has TIF_MEMDIE set, and will be OOM killed on return + * to user; and ksmd, having no mm, would never be chosen for that. + * + * But if the mm is in a limited mem_cgroup, then the fault may fail + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and + * even ksmd can fail in this way - though it's usually breaking ksm + * just to undo a merge it made a moment before, so unlikely to oom. + * + * That's a pity: we might therefore have more kernel pages allocated + * than we're counting as nodes in the stable tree; but uksm_do_scan + * will retry to break_cow on each pass, so should recover the page + * in due course. The important thing is to not let VM_MERGEABLE + * be cleared while any such pages might remain in the area. + */ + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; +} + +static void break_cow(struct rmap_item *rmap_item) +{ + struct vm_area_struct *vma = rmap_item->slot->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = get_rmap_addr(rmap_item); + + if (uksm_test_exit(mm)) + goto out; + + break_ksm(vma, addr); +out: + return; +} + +/* + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather + * than check every pte of a given vma, the locking doesn't quite work for + * that - an rmap_item is assigned to the stable tree after inserting ksm + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing + * rmap_items from parent to child at fork time (so as not to waste time + * if exit comes before the next scan reaches it). + * + * Similarly, although we'd like to remove rmap_items (so updating counts + * and freeing memory) when unmerging an area, it's easier to leave that + * to the next pass of ksmd - consider, for example, how ksmd might be + * in cmp_and_merge_page on one of the rmap_items we would be removing. 
+ */ +inline int unmerge_uksm_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + unsigned long addr; + int err = 0; + + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { + if (uksm_test_exit(vma->vm_mm)) + break; + if (signal_pending(current)) + err = -ERESTARTSYS; + else + err = break_ksm(vma, addr); + } + return err; +} + +static inline void inc_uksm_pages_scanned(void) +{ + u64 delta; + + + if (uksm_pages_scanned == U64_MAX) { + encode_benefit(); + + delta = uksm_pages_scanned >> pages_scanned_base; + + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { + pages_scanned_stored >>= 1; + delta >>= 1; + pages_scanned_base++; + } + + pages_scanned_stored += delta; + + uksm_pages_scanned = uksm_pages_scanned_last = 0; + } + + uksm_pages_scanned++; +} + +static inline int find_zero_page_hash(int strength, u32 hash) +{ + return (zero_hash_table[strength] == hash); +} + +static +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) +{ + struct page *zero_page = empty_uksm_zero_page; + struct mm_struct *mm = vma->vm_mm; + pte_t orig_pte = __pte(0); + int err = -EFAULT; + + if (uksm_test_exit(mm)) + goto out; + + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); + + if (!PageAnon(page)) + goto out; + + if (!trylock_page(page)) + goto out; + + if (write_protect_page(vma, page, &orig_pte, 0) == 0) { + if (is_page_full_zero(page)) + err = replace_page(vma, page, zero_page, orig_pte); + } + + unlock_page(page); +out: + return err; +} + +/* + * cmp_and_merge_page() - first see if page can be merged into the stable + * tree; if not, compare hash to previous and if it's the same, see if page + * can be inserted into the unstable tree, or merged with a page already there + * and both transferred to the stable tree. + * + * @page: the page that we are searching identical page to. + * @rmap_item: the reverse mapping into the virtual address of this page + */ +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) +{ + struct rmap_item *tree_rmap_item; + struct page *page; + struct page *kpage = NULL; + u32 hash_max; + int err; + unsigned int success1, success2; + struct stable_node *snode; + int cmp; + struct rb_node *parent = NULL, **new; + + remove_rmap_item_from_tree(rmap_item); + page = rmap_item->page; + + /* We first start with searching the page inside the stable tree */ + kpage = stable_tree_search(rmap_item, hash); + if (kpage) { + err = try_to_merge_with_uksm_page(rmap_item, kpage, + hash); + if (!err) { + /* + * The page was successfully merged, add + * its rmap_item to the stable tree. + * page lock is needed because it's + * racing with try_to_unmap_ksm(), etc. + */ + lock_page(kpage); + snode = page_stable_node(kpage); + stable_tree_append(rmap_item, snode, 1); + unlock_page(kpage); + put_page(kpage); + return; /* success */ + } + put_page(kpage); + + /* + * if it's a collision and it has been search in sub-rbtree + * (hash_max != 0), we want to abort, because if it is + * successfully merged in unstable tree, the collision trends to + * happen again. 
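+	 * (Both pages already agree on the sampled hash and on hash_max
+	 * yet still differ byte-wise, so a merge made now would only
+	 * re-create the same stable-tree collision on later rounds.)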
+ */ + if (err == MERGE_ERR_COLLI && rmap_item->hash_max) + return; + } + + tree_rmap_item = + unstable_tree_search_insert(rmap_item, hash); + if (tree_rmap_item) { + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); + /* + * As soon as we merge this page, we want to remove the + * rmap_item of the page we have merged with from the unstable + * tree, and insert it instead as new node in the stable tree. + */ + if (!err) { + kpage = page; + remove_rmap_item_from_tree(tree_rmap_item); + lock_page(kpage); + snode = stable_tree_insert(&kpage, hash, + rmap_item, tree_rmap_item, + &success1, &success2); + + /* + * Do not log dedup for tree item, it's not counted as + * scanned in this round. + */ + if (success2) + stable_tree_append(tree_rmap_item, snode, 0); + + /* + * The order of these two stable append is important: + * we are scanning rmap_item. + */ + if (success1) + stable_tree_append(rmap_item, snode, 1); + + /* + * The original kpage may be unlocked inside + * stable_tree_insert() already. This page + * should be unlocked before doing + * break_cow(). + */ + unlock_page(kpage); + + if (!success1) + break_cow(rmap_item); + + if (!success2) + break_cow(tree_rmap_item); + + } else if (err == MERGE_ERR_COLLI) { + BUG_ON(tree_rmap_item->tree_node->count > 1); + + rmap_item_hash_max(tree_rmap_item, + tree_rmap_item->tree_node->hash); + + hash_max = rmap_item_hash_max(rmap_item, hash); + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); + parent = &tree_rmap_item->node; + if (cmp < 0) + new = &parent->rb_left; + else if (cmp > 0) + new = &parent->rb_right; + else + goto put_up_out; + + rmap_item->tree_node = tree_rmap_item->tree_node; + rmap_item->address |= UNSTABLE_FLAG; + rmap_item->hash_round = uksm_hash_round; + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, + &tree_rmap_item->tree_node->sub_root); + rmap_item->tree_node->count++; + } else { + /* + * either one of the page has changed or they collide + * at the max hash, we consider them as ill items. 
+ */ + remove_rmap_item_from_tree(tree_rmap_item); + } +put_up_out: + put_page(tree_rmap_item->page); + up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); + } +} + + + + +static inline unsigned long get_pool_index(struct vma_slot *slot, + unsigned long index) +{ + unsigned long pool_index; + + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; + if (pool_index >= slot->pool_size) + BUG(); + return pool_index; +} + +static inline unsigned long index_page_offset(unsigned long index) +{ + return offset_in_page(sizeof(struct rmap_list_entry *) * index); +} + +static inline +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, + unsigned long index, int need_alloc) +{ + unsigned long pool_index; + struct page *page; + void *addr; + + + pool_index = get_pool_index(slot, index); + if (!slot->rmap_list_pool[pool_index]) { + if (!need_alloc) + return NULL; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); + if (!page) + return NULL; + + slot->rmap_list_pool[pool_index] = page; + } + + addr = kmap(slot->rmap_list_pool[pool_index]); + addr += index_page_offset(index); + + return addr; +} + +static inline void put_rmap_list_entry(struct vma_slot *slot, + unsigned long index) +{ + unsigned long pool_index; + + pool_index = get_pool_index(slot, index); + BUG_ON(!slot->rmap_list_pool[pool_index]); + kunmap(slot->rmap_list_pool[pool_index]); +} + +static inline int entry_is_new(struct rmap_list_entry *entry) +{ + return !entry->item; +} + +static inline unsigned long get_index_orig_addr(struct vma_slot *slot, + unsigned long index) +{ + return slot->vma->vm_start + (index << PAGE_SHIFT); +} + +static inline unsigned long get_entry_address(struct rmap_list_entry *entry) +{ + unsigned long addr; + + if (is_addr(entry->addr)) + addr = get_clean_addr(entry->addr); + else if (entry->item) + addr = get_rmap_addr(entry->item); + else + BUG(); + + return addr; +} + +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) +{ + if (is_addr(entry->addr)) + return NULL; + + return entry->item; +} + +static inline void inc_rmap_list_pool_count(struct vma_slot *slot, + unsigned long index) +{ + unsigned long pool_index; + + pool_index = get_pool_index(slot, index); + BUG_ON(!slot->rmap_list_pool[pool_index]); + slot->pool_counts[pool_index]++; +} + +static inline void dec_rmap_list_pool_count(struct vma_slot *slot, + unsigned long index) +{ + unsigned long pool_index; + + pool_index = get_pool_index(slot, index); + BUG_ON(!slot->rmap_list_pool[pool_index]); + BUG_ON(!slot->pool_counts[pool_index]); + slot->pool_counts[pool_index]--; +} + +static inline int entry_has_rmap(struct rmap_list_entry *entry) +{ + return !is_addr(entry->addr) && entry->item; +} + +static inline void swap_entries(struct rmap_list_entry *entry1, + unsigned long index1, + struct rmap_list_entry *entry2, + unsigned long index2) +{ + struct rmap_list_entry tmp; + + /* swapping two new entries is meaningless */ + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); + + tmp = *entry1; + *entry1 = *entry2; + *entry2 = tmp; + + if (entry_has_rmap(entry1)) + entry1->item->entry_index = index1; + + if (entry_has_rmap(entry2)) + entry2->item->entry_index = index2; + + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { + inc_rmap_list_pool_count(entry1->item->slot, index1); + dec_rmap_list_pool_count(entry1->item->slot, index2); + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { + inc_rmap_list_pool_count(entry2->item->slot, index2); + 
dec_rmap_list_pool_count(entry2->item->slot, index1);
+	}
+}
+
+static inline void free_entry_item(struct rmap_list_entry *entry)
+{
+	unsigned long index;
+	struct rmap_item *item;
+
+	if (!is_addr(entry->addr)) {
+		BUG_ON(!entry->item);
+		item = entry->item;
+		entry->addr = get_rmap_addr(item);
+		set_is_addr(entry->addr);
+		index = item->entry_index;
+		remove_rmap_item_from_tree(item);
+		dec_rmap_list_pool_count(item->slot, index);
+		free_rmap_item(item);
+	}
+}
+
+static inline int pool_entry_boundary(unsigned long index)
+{
+	unsigned long linear_addr;
+
+	linear_addr = sizeof(struct rmap_list_entry *) * index;
+	return index && !offset_in_page(linear_addr);
+}
+
+static inline void try_free_last_pool(struct vma_slot *slot,
+				      unsigned long index)
+{
+	unsigned long pool_index;
+
+	pool_index = get_pool_index(slot, index);
+	if (slot->rmap_list_pool[pool_index] &&
+	    !slot->pool_counts[pool_index]) {
+		__free_page(slot->rmap_list_pool[pool_index]);
+		slot->rmap_list_pool[pool_index] = NULL;
+		slot->flags |= UKSM_SLOT_NEED_SORT;
+	}
+
+}
+
+static inline unsigned long vma_item_index(struct vm_area_struct *vma,
+					   struct rmap_item *item)
+{
+	return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
+}
+
+static int within_same_pool(struct vma_slot *slot,
+			    unsigned long i, unsigned long j)
+{
+	unsigned long pool_i, pool_j;
+
+	pool_i = get_pool_index(slot, i);
+	pool_j = get_pool_index(slot, j);
+
+	return (pool_i == pool_j);
+}
+
+static void sort_rmap_entry_list(struct vma_slot *slot)
+{
+	unsigned long i, j;
+	struct rmap_list_entry *entry, *swap_entry;
+
+	entry = get_rmap_list_entry(slot, 0, 0);
+	for (i = 0; i < slot->pages; ) {
+
+		if (!entry)
+			goto skip_whole_pool;
+
+		if (entry_is_new(entry))
+			goto next_entry;
+
+		if (is_addr(entry->addr)) {
+			entry->addr = 0;
+			goto next_entry;
+		}
+
+		j = vma_item_index(slot->vma, entry->item);
+		if (j == i)
+			goto next_entry;
+
+		if (within_same_pool(slot, i, j))
+			swap_entry = entry + j - i;
+		else
+			swap_entry = get_rmap_list_entry(slot, j, 1);
+
+		swap_entries(entry, i, swap_entry, j);
+		if (!within_same_pool(slot, i, j))
+			put_rmap_list_entry(slot, j);
+		continue;
+
+skip_whole_pool:
+		i += PAGE_SIZE / sizeof(*entry);
+		if (i < slot->pages)
+			entry = get_rmap_list_entry(slot, i, 0);
+		continue;
+
+next_entry:
+		if (i >= slot->pages - 1 ||
+		    !within_same_pool(slot, i, i + 1)) {
+			put_rmap_list_entry(slot, i);
+			if (i + 1 < slot->pages)
+				entry = get_rmap_list_entry(slot, i + 1, 0);
+		} else
+			entry++;
+		i++;
+		continue;
+	}
+
+	/* free empty pool entries which contain no rmap_item */
+	/*
+	 * This could be simplified to rely on pool_counts alone once the
+	 * related bug is fixed.
+	 */
+	for (i = 0; i < slot->pool_size; i++) {
+		unsigned char has_rmap;
+		void *addr;
+
+		if (!slot->rmap_list_pool[i])
+			continue;
+
+		has_rmap = 0;
+		addr = kmap(slot->rmap_list_pool[i]);
+		BUG_ON(!addr);
+		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
+			entry = (struct rmap_list_entry *)addr + j;
+			if (is_addr(entry->addr))
+				continue;
+			if (!entry->item)
+				continue;
+			has_rmap = 1;
+		}
+		kunmap(slot->rmap_list_pool[i]);
+		if (!has_rmap) {
+			BUG_ON(slot->pool_counts[i]);
+			__free_page(slot->rmap_list_pool[i]);
+			slot->rmap_list_pool[i] = NULL;
+		}
+	}
+
+	slot->flags &= ~UKSM_SLOT_NEED_SORT;
+}
+
+/*
+ * vma_fully_scanned() - return true if all the pages in this slot have been
+ * scanned.
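+ *
+ * The scan order within a slot is a lazily generated random
+ * permutation: get_next_rmap_item() below swaps the current entry with
+ * one drawn uniformly from the remaining range, an incremental
+ * Fisher-Yates shuffle (sketch):
+ *
+ *	swap_index = scan_index + prandom_u32() % (pages - scan_index);
+ *	swap_entries(scan_entry, scan_index, swap_entry, swap_index);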
+ */ +static inline int vma_fully_scanned(struct vma_slot *slot) +{ + return slot->pages_scanned == slot->pages; +} + +/** + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to + * its random permutation. This function is embedded with the random + * permutation index management code. + */ +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) +{ + unsigned long rand_range, addr, swap_index, scan_index; + struct rmap_item *item = NULL; + struct rmap_list_entry *scan_entry, *swap_entry = NULL; + struct page *page; + + scan_index = swap_index = slot->pages_scanned % slot->pages; + + if (pool_entry_boundary(scan_index)) + try_free_last_pool(slot, scan_index - 1); + + if (vma_fully_scanned(slot)) { + if (slot->flags & UKSM_SLOT_NEED_SORT) + slot->flags |= UKSM_SLOT_NEED_RERAND; + else + slot->flags &= ~UKSM_SLOT_NEED_RERAND; + if (slot->flags & UKSM_SLOT_NEED_SORT) + sort_rmap_entry_list(slot); + } + + scan_entry = get_rmap_list_entry(slot, scan_index, 1); + if (!scan_entry) + return NULL; + + if (entry_is_new(scan_entry)) { + scan_entry->addr = get_index_orig_addr(slot, scan_index); + set_is_addr(scan_entry->addr); + } + + if (slot->flags & UKSM_SLOT_NEED_RERAND) { + rand_range = slot->pages - scan_index; + BUG_ON(!rand_range); + swap_index = scan_index + (prandom_u32() % rand_range); + } + + if (swap_index != scan_index) { + swap_entry = get_rmap_list_entry(slot, swap_index, 1); + if (entry_is_new(swap_entry)) { + swap_entry->addr = get_index_orig_addr(slot, + swap_index); + set_is_addr(swap_entry->addr); + } + swap_entries(scan_entry, scan_index, swap_entry, swap_index); + } + + addr = get_entry_address(scan_entry); + item = get_entry_item(scan_entry); + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); + + page = follow_page(slot->vma, addr, FOLL_GET); + if (IS_ERR_OR_NULL(page)) + goto nopage; + + if (!PageAnon(page) && !page_trans_compound_anon(page)) + goto putpage; + + /*check is zero_page pfn or uksm_zero_page*/ + if ((page_to_pfn(page) == zero_pfn) + || (page_to_pfn(page) == uksm_zero_pfn)) + goto putpage; + + flush_anon_page(slot->vma, page, addr); + flush_dcache_page(page); + + + *hash = page_hash(page, hash_strength, 1); + inc_uksm_pages_scanned(); + /*if the page content all zero, re-map to zero-page*/ + if (find_zero_page_hash(hash_strength, *hash)) { + if (!cmp_and_merge_zero_page(slot->vma, page)) { + slot->pages_merged++; + inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); + dec_mm_counter(slot->mm, MM_ANONPAGES); + + /* For full-zero pages, no need to create rmap item */ + goto putpage; + } else { + inc_rshash_neg(memcmp_cost / 2); + } + } + + if (!item) { + item = alloc_rmap_item(); + if (item) { + /* It has already been zeroed */ + item->slot = slot; + item->address = addr; + item->entry_index = scan_index; + scan_entry->item = item; + inc_rmap_list_pool_count(slot, scan_index); + } else + goto putpage; + } + + BUG_ON(item->slot != slot); + /* the page may have changed */ + item->page = page; + put_rmap_list_entry(slot, scan_index); + if (swap_entry) + put_rmap_list_entry(slot, swap_index); + return item; + +putpage: + put_page(page); + page = NULL; +nopage: + /* no page, store addr back and free rmap_item if possible */ + free_entry_item(scan_entry); + put_rmap_list_entry(slot, scan_index); + if (swap_entry) + put_rmap_list_entry(slot, swap_index); + return NULL; +} + +static inline int in_stable_tree(struct rmap_item *rmap_item) +{ + return rmap_item->address & STABLE_FLAG; +} + +/** + * scan_vma_one_page() - scan 
the next page in a vma_slot. Called with
+ * mmap_sem locked.
+ */
+static noinline void scan_vma_one_page(struct vma_slot *slot)
+{
+	u32 hash;
+	struct mm_struct *mm;
+	struct rmap_item *rmap_item = NULL;
+	struct vm_area_struct *vma = slot->vma;
+
+	mm = vma->vm_mm;
+	BUG_ON(!mm);
+	BUG_ON(!slot);
+
+	rmap_item = get_next_rmap_item(slot, &hash);
+	if (!rmap_item)
+		goto out1;
+
+	if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
+		goto out2;
+
+	cmp_and_merge_page(rmap_item, hash);
+out2:
+	put_page(rmap_item->page);
+out1:
+	slot->pages_scanned++;
+	if (slot->fully_scanned_round != fully_scanned_round)
+		scanned_virtual_pages++;
+
+	if (vma_fully_scanned(slot))
+		slot->fully_scanned_round = fully_scanned_round;
+}
+
+static inline unsigned long rung_get_pages(struct scan_rung *rung)
+{
+	struct slot_tree_node *node;
+
+	if (!rung->vma_root.rnode)
+		return 0;
+
+	node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
+
+	return node->size;
+}
+
+#define RUNG_SAMPLED_MIN	3
+
+static inline
+void uksm_calc_rung_step(struct scan_rung *rung,
+			 unsigned long page_time, unsigned long ratio)
+{
+	unsigned long sampled, pages;
+
+	/* will this rung be fully scanned? */
+	if (!rung->cover_msecs) {
+		rung->step = 1;
+		return;
+	}
+
+	sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
+		  * ratio / page_time;
+
+	/*
+	 * Before we finish a scan round and its expensive per-round jobs,
+	 * we need a chance to estimate the per-page scan time, so the
+	 * sampled number cannot be too small.
+	 */
+	if (sampled < RUNG_SAMPLED_MIN)
+		sampled = RUNG_SAMPLED_MIN;
+
+	pages = rung_get_pages(rung);
+	if (likely(pages > sampled))
+		rung->step = pages / sampled;
+	else
+		rung->step = 1;
+}
+
+static inline int step_need_recalc(struct scan_rung *rung)
+{
+	unsigned long pages, stepmax;
+
+	pages = rung_get_pages(rung);
+	stepmax = pages / RUNG_SAMPLED_MIN;
+
+	return pages && (rung->step > pages ||
+			 (stepmax && rung->step > stepmax));
+}
+
+static inline
+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
+{
+	struct vma_slot *slot;
+
+	if (finished)
+		rung->flags |= UKSM_RUNG_ROUND_FINISHED;
+
+	if (step_recalc || step_need_recalc(rung)) {
+		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
+		BUG_ON(step_need_recalc(rung));
+	}
+
+	slot_iter_index = prandom_u32() % rung->step;
+	BUG_ON(!rung->vma_root.rnode);
+	slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
+	BUG_ON(!slot);
+
+	rung->current_scan = slot;
+	rung->current_offset = slot_iter_index;
+}
+
+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
+{
+	return &slot->rung->vma_root;
+}
+
+/*
+ * Return 1 if the scan position was reset, 0 otherwise.
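+ *
+ * The cursor walks a rung with a fixed stride (rung->step); when it
+ * steps off the end of one slot, the leftover part of the stride is
+ * carried into the next slot via slot_iter_index, so the sampling
+ * density stays uniform across slot boundaries:
+ *
+ *	n = (slot->pages - rung->current_offset) % rung->step;
+ *	slot_iter_index = rung->step - n;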
+ */ +static int advance_current_scan(struct scan_rung *rung) +{ + unsigned short n; + struct vma_slot *slot, *next = NULL; + + BUG_ON(!rung->vma_root.num); + + slot = rung->current_scan; + n = (slot->pages - rung->current_offset) % rung->step; + slot_iter_index = rung->step - n; + next = sradix_tree_next(&rung->vma_root, slot->snode, + slot->sindex, slot_iter); + + if (next) { + rung->current_offset = slot_iter_index; + rung->current_scan = next; + return 0; + } else { + reset_current_scan(rung, 1, 0); + return 1; + } +} + +static inline void rung_rm_slot(struct vma_slot *slot) +{ + struct scan_rung *rung = slot->rung; + struct sradix_tree_root *root; + + if (rung->current_scan == slot) + advance_current_scan(rung); + + root = slot_get_root(slot); + sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); + slot->snode = NULL; + if (step_need_recalc(rung)) { + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); + BUG_ON(step_need_recalc(rung)); + } + + /* In case advance_current_scan loop back to this slot again */ + if (rung->vma_root.num && rung->current_scan == slot) + reset_current_scan(slot->rung, 1, 0); +} + +static inline void rung_add_new_slots(struct scan_rung *rung, + struct vma_slot **slots, unsigned long num) +{ + int err; + struct vma_slot *slot; + unsigned long i; + struct sradix_tree_root *root = &rung->vma_root; + + err = sradix_tree_enter(root, (void **)slots, num); + BUG_ON(err); + + for (i = 0; i < num; i++) { + slot = slots[i]; + slot->rung = rung; + BUG_ON(vma_fully_scanned(slot)); + } + + if (rung->vma_root.num == num) + reset_current_scan(rung, 0, 1); +} + +static inline int rung_add_one_slot(struct scan_rung *rung, + struct vma_slot *slot) +{ + int err; + + err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); + if (err) + return err; + + slot->rung = rung; + if (rung->vma_root.num == 1) + reset_current_scan(rung, 0, 1); + + return 0; +} + +/* + * Return true if the slot is deleted from its rung. + */ +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) +{ + struct scan_rung *old_rung = slot->rung; + int err; + + if (old_rung == rung) + return 0; + + rung_rm_slot(slot); + err = rung_add_one_slot(rung, slot); + if (err) { + err = rung_add_one_slot(old_rung, slot); + WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ + } + + return 1; +} + +static inline int vma_rung_up(struct vma_slot *slot) +{ + struct scan_rung *rung; + + rung = slot->rung; + if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) + rung++; + + return vma_rung_enter(slot, rung); +} + +static inline int vma_rung_down(struct vma_slot *slot) +{ + struct scan_rung *rung; + + rung = slot->rung; + if (slot->rung != &uksm_scan_ladder[0]) + rung--; + + return vma_rung_enter(slot, rung); +} + +/** + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. + */ +static unsigned long cal_dedup_ratio(struct vma_slot *slot) +{ + unsigned long ret; + + BUG_ON(slot->pages_scanned == slot->last_scanned); + + ret = slot->pages_merged; + + /* Thrashing area filtering */ + if (ret && uksm_thrash_threshold) { + if (slot->pages_cowed * 100 / slot->pages_merged + > uksm_thrash_threshold) { + ret = 0; + } else { + ret = slot->pages_merged - slot->pages_cowed; + } + } + + return ret; +} + +/** + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. 
+ */ +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) +{ + unsigned long ret; + unsigned long pages_scanned; + + pages_scanned = slot->pages_scanned; + if (!pages_scanned) { + if (uksm_thrash_threshold) + return 0; + else + pages_scanned = slot->pages_scanned; + } + + ret = slot->pages_bemerged * 100 / pages_scanned; + + /* Thrashing area filtering */ + if (ret && uksm_thrash_threshold) { + if (slot->pages_cowed * 100 / slot->pages_bemerged + > uksm_thrash_threshold) { + ret = 0; + } else { + ret = slot->pages_bemerged - slot->pages_cowed; + } + } + + return ret; +} + +/** + * stable_node_reinsert() - When the hash_strength has been adjusted, the + * stable tree need to be restructured, this is the function re-inserting the + * stable node. + */ +static inline void stable_node_reinsert(struct stable_node *new_node, + struct page *page, + struct rb_root *root_treep, + struct list_head *tree_node_listp, + u32 hash) +{ + struct rb_node **new = &root_treep->rb_node; + struct rb_node *parent = NULL; + struct stable_node *stable_node; + struct tree_node *tree_node; + struct page *tree_page; + int cmp; + + while (*new) { + int cmp; + + tree_node = rb_entry(*new, struct tree_node, node); + + cmp = hash_cmp(hash, tree_node->hash); + + if (cmp < 0) { + parent = *new; + new = &parent->rb_left; + } else if (cmp > 0) { + parent = *new; + new = &parent->rb_right; + } else + break; + } + + if (*new) { + /* find a stable tree node with same first level hash value */ + stable_node_hash_max(new_node, page, hash); + if (tree_node->count == 1) { + stable_node = rb_entry(tree_node->sub_root.rb_node, + struct stable_node, node); + tree_page = get_uksm_page(stable_node, 1, 0); + if (tree_page) { + stable_node_hash_max(stable_node, + tree_page, hash); + put_page(tree_page); + + /* prepare for stable node insertion */ + + cmp = hash_cmp(new_node->hash_max, + stable_node->hash_max); + parent = &stable_node->node; + if (cmp < 0) + new = &parent->rb_left; + else if (cmp > 0) + new = &parent->rb_right; + else + goto failed; + + goto add_node; + } else { + /* the only stable_node deleted, the tree node + * was not deleted. + */ + goto tree_node_reuse; + } + } + + /* well, search the collision subtree */ + new = &tree_node->sub_root.rb_node; + parent = NULL; + BUG_ON(!*new); + while (*new) { + int cmp; + + stable_node = rb_entry(*new, struct stable_node, node); + + cmp = hash_cmp(new_node->hash_max, + stable_node->hash_max); + + if (cmp < 0) { + parent = *new; + new = &parent->rb_left; + } else if (cmp > 0) { + parent = *new; + new = &parent->rb_right; + } else { + /* oh, no, still a collision */ + goto failed; + } + } + + goto add_node; + } + + /* no tree node found */ + tree_node = alloc_tree_node(tree_node_listp); + if (!tree_node) { + printk(KERN_ERR "UKSM: memory allocation error!\n"); + goto failed; + } else { + tree_node->hash = hash; + rb_link_node(&tree_node->node, parent, new); + rb_insert_color(&tree_node->node, root_treep); + +tree_node_reuse: + /* prepare for stable node insertion */ + parent = NULL; + new = &tree_node->sub_root.rb_node; + } + +add_node: + rb_link_node(&new_node->node, parent, new); + rb_insert_color(&new_node->node, &tree_node->sub_root); + new_node->tree_node = tree_node; + tree_node->count++; + return; + +failed: + /* This can only happen when two nodes have collided + * in two levels. 
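+	 * The node is then left with tree_node == NULL but keeps its
+	 * kpfn; stable_tree_delta_hash() treats such orphans specially on
+	 * the next restructure, re-hashing their pages from scratch
+	 * instead of delta-hashing, so they get another chance to be
+	 * inserted.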
+	 */
+	new_node->tree_node = NULL;
+	return;
+}
+
+static inline void free_all_tree_nodes(struct list_head *list)
+{
+	struct tree_node *node, *tmp;
+
+	list_for_each_entry_safe(node, tmp, list, all_list) {
+		free_tree_node(node);
+	}
+}
+
+/**
+ * stable_tree_delta_hash() - Delta hash the stable tree from the previous
+ * hash strength to the current hash_strength. It re-structures the whole
+ * tree.
+ */
+static inline void stable_tree_delta_hash(u32 prev_hash_strength)
+{
+	struct stable_node *node, *tmp;
+	struct rb_root *root_new_treep;
+	struct list_head *new_tree_node_listp;
+
+	stable_tree_index = (stable_tree_index + 1) % 2;
+	root_new_treep = &root_stable_tree[stable_tree_index];
+	new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
+	*root_new_treep = RB_ROOT;
+	BUG_ON(!list_empty(new_tree_node_listp));
+
+	/*
+	 * We need to be safe: the node could be removed by get_uksm_page().
+	 */
+	list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
+		void *addr;
+		struct page *node_page;
+		u32 hash;
+
+		/*
+		 * We are completely re-structuring the stable nodes into a
+		 * new stable tree. We don't want to touch the old tree
+		 * unlinks and old tree_nodes; the old tree_nodes will be
+		 * freed at once.
+		 */
+		node_page = get_uksm_page(node, 0, 0);
+		if (!node_page)
+			continue;
+
+		if (node->tree_node) {
+			hash = node->tree_node->hash;
+
+			addr = kmap_atomic(node_page);
+
+			hash = delta_hash(addr, prev_hash_strength,
+					  hash_strength, hash);
+			kunmap_atomic(addr);
+		} else {
+			/*
+			 * It was not inserted into the rbtree due to a
+			 * collision in the last scan round.
+			 */
+			hash = page_hash(node_page, hash_strength, 0);
+		}
+
+		stable_node_reinsert(node, node_page, root_new_treep,
+				     new_tree_node_listp, hash);
+		put_page(node_page);
+	}
+
+	root_stable_treep = root_new_treep;
+	free_all_tree_nodes(stable_tree_node_listp);
+	BUG_ON(!list_empty(stable_tree_node_listp));
+	stable_tree_node_listp = new_tree_node_listp;
+}
+
+static inline void inc_hash_strength(unsigned long delta)
+{
+	hash_strength += 1 << delta;
+	if (hash_strength > HASH_STRENGTH_MAX)
+		hash_strength = HASH_STRENGTH_MAX;
+}
+
+static inline void dec_hash_strength(unsigned long delta)
+{
+	unsigned long change = 1 << delta;
+
+	if (hash_strength <= change + 1)
+		hash_strength = 1;
+	else
+		hash_strength -= change;
+}
+
+static inline void inc_hash_strength_delta(void)
+{
+	hash_strength_delta++;
+	if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
+		hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
+}
+
+/*
+static inline unsigned long get_current_neg_ratio(void)
+{
+	if (!rshash_pos || rshash_neg > rshash_pos)
+		return 100;
+
+	return div64_u64(100 * rshash_neg, rshash_pos);
+}
+*/
+
+static inline unsigned long get_current_neg_ratio(void)
+{
+	u64 pos = benefit.pos;
+	u64 neg = benefit.neg;
+
+	if (!neg)
+		return 0;
+
+	if (!pos || neg > pos)
+		return 100;
+
+	if (neg > div64_u64(U64_MAX, 100))
+		pos = div64_u64(pos, 100);
+	else
+		neg *= 100;
+
+	return div64_u64(neg, pos);
+}
+
+static inline unsigned long get_current_benefit(void)
+{
+	u64 pos = benefit.pos;
+	u64 neg = benefit.neg;
+	u64 scanned = benefit.scanned;
+
+	if (neg > pos)
+		return 0;
+
+	return div64_u64((pos - neg), scanned);
+}
+
+static inline int judge_rshash_direction(void)
+{
+	u64 current_neg_ratio, stable_benefit;
+	u64 current_benefit, delta = 0;
+	int ret = STILL;
+
+	/* Try to probe a value shortly after boot, and again in case the
+	   system stays idle for a long time.
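+	   With the mask below, this re-probe fires whenever
+	   (fully_scanned_round & 0xFF) == 10, i.e. on round 10 and then
+	   once every 256 full rounds thereafter (266, 522, ...).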
*/ + if ((fully_scanned_round & 0xFFULL) == 10) { + ret = OBSCURE; + goto out; + } + + current_neg_ratio = get_current_neg_ratio(); + + if (current_neg_ratio == 0) { + rshash_neg_cont_zero++; + if (rshash_neg_cont_zero > 2) + return GO_DOWN; + else + return STILL; + } + rshash_neg_cont_zero = 0; + + if (current_neg_ratio > 90) { + ret = GO_UP; + goto out; + } + + current_benefit = get_current_benefit(); + stable_benefit = rshash_state.stable_benefit; + + if (!stable_benefit) { + ret = OBSCURE; + goto out; + } + + if (current_benefit > stable_benefit) + delta = current_benefit - stable_benefit; + else if (current_benefit < stable_benefit) + delta = stable_benefit - current_benefit; + + delta = div64_u64(100 * delta , stable_benefit); + + if (delta > 50) { + rshash_cont_obscure++; + if (rshash_cont_obscure > 2) + return OBSCURE; + else + return STILL; + } + +out: + rshash_cont_obscure = 0; + return ret; +} + +/** + * rshash_adjust() - The main function to control the random sampling state + * machine for hash strength adapting. + * + * return true if hash_strength has changed. + */ +static inline int rshash_adjust(void) +{ + unsigned long prev_hash_strength = hash_strength; + + if (!encode_benefit()) + return 0; + + switch (rshash_state.state) { + case RSHASH_STILL: + switch (judge_rshash_direction()) { + case GO_UP: + if (rshash_state.pre_direct == GO_DOWN) + hash_strength_delta = 0; + + inc_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + rshash_state.stable_benefit = get_current_benefit(); + rshash_state.pre_direct = GO_UP; + break; + + case GO_DOWN: + if (rshash_state.pre_direct == GO_UP) + hash_strength_delta = 0; + + dec_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + rshash_state.stable_benefit = get_current_benefit(); + rshash_state.pre_direct = GO_DOWN; + break; + + case OBSCURE: + rshash_state.stable_point = hash_strength; + rshash_state.turn_point_down = hash_strength; + rshash_state.turn_point_up = hash_strength; + rshash_state.turn_benefit_down = get_current_benefit(); + rshash_state.turn_benefit_up = get_current_benefit(); + rshash_state.lookup_window_index = 0; + rshash_state.state = RSHASH_TRYDOWN; + dec_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + break; + + case STILL: + break; + default: + BUG(); + } + break; + + case RSHASH_TRYDOWN: + if (rshash_state.lookup_window_index++ % 5 == 0) + rshash_state.below_count = 0; + + if (get_current_benefit() < rshash_state.stable_benefit) + rshash_state.below_count++; + else if (get_current_benefit() > + rshash_state.turn_benefit_down) { + rshash_state.turn_point_down = hash_strength; + rshash_state.turn_benefit_down = get_current_benefit(); + } + + if (rshash_state.below_count >= 3 || + judge_rshash_direction() == GO_UP || + hash_strength == 1) { + hash_strength = rshash_state.stable_point; + hash_strength_delta = 0; + inc_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + rshash_state.lookup_window_index = 0; + rshash_state.state = RSHASH_TRYUP; + hash_strength_delta = 0; + } else { + dec_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + } + break; + + case RSHASH_TRYUP: + if (rshash_state.lookup_window_index++ % 5 == 0) + rshash_state.below_count = 0; + + if (get_current_benefit() < rshash_state.turn_benefit_down) + rshash_state.below_count++; + else if (get_current_benefit() > rshash_state.turn_benefit_up) { + rshash_state.turn_point_up = hash_strength; + rshash_state.turn_benefit_up = get_current_benefit(); + } + + if 
(rshash_state.below_count >= 3 || + judge_rshash_direction() == GO_DOWN || + hash_strength == HASH_STRENGTH_MAX) { + hash_strength = rshash_state.turn_benefit_up > + rshash_state.turn_benefit_down ? + rshash_state.turn_point_up : + rshash_state.turn_point_down; + + rshash_state.state = RSHASH_PRE_STILL; + } else { + inc_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + } + + break; + + case RSHASH_NEW: + case RSHASH_PRE_STILL: + rshash_state.stable_benefit = get_current_benefit(); + rshash_state.state = RSHASH_STILL; + hash_strength_delta = 0; + break; + default: + BUG(); + } + + /* rshash_neg = rshash_pos = 0; */ + reset_benefit(); + + if (prev_hash_strength != hash_strength) + stable_tree_delta_hash(prev_hash_strength); + + return prev_hash_strength != hash_strength; +} + +/** + * round_update_ladder() - The main function to do update of all the + * adjustments whenever a scan round is finished. + */ +static noinline void round_update_ladder(void) +{ + int i; + unsigned long dedup; + struct vma_slot *slot, *tmp_slot; + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; + } + + list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { + + /* slot may be rung_rm_slot() when mm exits */ + if (slot->snode) { + dedup = cal_dedup_ratio_old(slot); + if (dedup && dedup >= uksm_abundant_threshold) + vma_rung_up(slot); + } + + slot->pages_bemerged = 0; + slot->pages_cowed = 0; + + list_del_init(&slot->dedup_list); + } +} + +static void uksm_del_vma_slot(struct vma_slot *slot) +{ + int i, j; + struct rmap_list_entry *entry; + + if (slot->snode) { + /* + * In case it just failed when entering the rung, it's not + * necessary. + */ + rung_rm_slot(slot); + } + + if (!list_empty(&slot->dedup_list)) + list_del(&slot->dedup_list); + + if (!slot->rmap_list_pool || !slot->pool_counts) { + /* In case it OOMed in uksm_vma_enter() */ + goto out; + } + + for (i = 0; i < slot->pool_size; i++) { + void *addr; + + if (!slot->rmap_list_pool[i]) + continue; + + addr = kmap(slot->rmap_list_pool[i]); + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { + entry = (struct rmap_list_entry *)addr + j; + if (is_addr(entry->addr)) + continue; + if (!entry->item) + continue; + + remove_rmap_item_from_tree(entry->item); + free_rmap_item(entry->item); + slot->pool_counts[i]--; + } + BUG_ON(slot->pool_counts[i]); + kunmap(slot->rmap_list_pool[i]); + __free_page(slot->rmap_list_pool[i]); + } + kfree(slot->rmap_list_pool); + kfree(slot->pool_counts); + +out: + slot->rung = NULL; + if (slot->flags & UKSM_SLOT_IN_UKSM) { + BUG_ON(uksm_pages_total < slot->pages); + uksm_pages_total -= slot->pages; + } + + if (slot->fully_scanned_round == fully_scanned_round) + scanned_virtual_pages -= slot->pages; + else + scanned_virtual_pages -= slot->pages_scanned; + free_vma_slot(slot); +} + + +#define SPIN_LOCK_PERIOD 32 +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; +static inline void cleanup_vma_slots(void) +{ + struct vma_slot *slot; + int i; + + i = 0; + spin_lock(&vma_slot_list_lock); + while (!list_empty(&vma_slot_del)) { + slot = list_entry(vma_slot_del.next, + struct vma_slot, slot_list); + list_del(&slot->slot_list); + cleanup_slots[i++] = slot; + if (i == SPIN_LOCK_PERIOD) { + spin_unlock(&vma_slot_list_lock); + while (--i >= 0) + uksm_del_vma_slot(cleanup_slots[i]); + i = 0; + spin_lock(&vma_slot_list_lock); + } + } + spin_unlock(&vma_slot_list_lock); + + while (--i >= 0) + uksm_del_vma_slot(cleanup_slots[i]); +} + +/* +*expotional moving 
average formula
+*/
+static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
+{
+	/*
+	 * For a very high burst, even the EMA cannot work well: a falsely
+	 * high per-page time estimation feeds back into a very high
+	 * overhead of context switch and rung update, which leads to an
+	 * even higher per-page time and may never converge.
+	 *
+	 * Instead, we try to approach this value in a binary manner.
+	 */
+	if (curr > last_ema * 10)
+		return last_ema * 2;
+
+	return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
+}
+
+/*
+ * Convert a cpu ratio, given by the user in units of 1/TIME_RATIO_SCALE,
+ * to nanoseconds based on the current uksm_sleep_jiffies.
+ */
+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
+{
+	return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
+		(TIME_RATIO_SCALE - ratio) * ratio;
+}
+
+
+static inline unsigned long rung_real_ratio(int cpu_time_ratio)
+{
+	unsigned long ret;
+
+	BUG_ON(!cpu_time_ratio);
+
+	if (cpu_time_ratio > 0)
+		ret = cpu_time_ratio;
+	else
+		ret = (unsigned long)(-cpu_time_ratio) *
+			uksm_max_cpu_percentage / 100UL;
+
+	return ret ? ret : 1;
+}
+
+static noinline void uksm_calc_scan_pages(void)
+{
+	struct scan_rung *ladder = uksm_scan_ladder;
+	unsigned long sleep_usecs, nsecs;
+	unsigned long ratio;
+	int i;
+	unsigned long per_page;
+
+	if (uksm_ema_page_time > 100000 ||
+	    (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
+		uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
+
+	per_page = uksm_ema_page_time;
+	BUG_ON(!per_page);
+
+	/*
+	 * Every 8 eval rounds, we try to probe a uksm_sleep_jiffies value
+	 * based on the saved user input.
+	 */
+	if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
+		uksm_sleep_jiffies = uksm_sleep_saved;
+
+	/* We require each rung to scan at least 1 page in a period. */
+	nsecs = per_page;
+	ratio = rung_real_ratio(ladder[0].cpu_ratio);
+	if (cpu_ratio_to_nsec(ratio) < nsecs) {
+		sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
+				/ NSEC_PER_USEC;
+		uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
+	}
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		ratio = rung_real_ratio(ladder[i].cpu_ratio);
+		ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
+					per_page;
+		BUG_ON(!ladder[i].pages_to_scan);
+		uksm_calc_rung_step(&ladder[i], per_page, ratio);
+	}
+}
+
+/*
+ * From the scan time of this round (ns) to the next expected minimum
+ * sleep time (ms), be careful of possible overflows.
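+ * The scan time is shifted down by 20 bits first (ns to roughly ms)
+ * and bounds-checked against ULONG_MAX / TIME_RATIO_SCALE, so the
+ * multiplication below cannot overflow; the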
ratio is taken from + * rung_real_ratio() + */ +static inline +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) +{ + scan_time >>= 20; /* to msec level now */ + BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); + + return (unsigned int) ((unsigned long) scan_time * + (TIME_RATIO_SCALE - ratio) / ratio); +} + +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) + +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) +{ + struct scan_rung *rung; + + rung = &uksm_scan_ladder[0]; + rung_add_new_slots(rung, slots, num); +} + +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; + +static void uksm_enter_all_slots(void) +{ + struct vma_slot *slot; + unsigned long index; + struct list_head empty_vma_list; + int i; + + i = 0; + index = 0; + INIT_LIST_HEAD(&empty_vma_list); + + spin_lock(&vma_slot_list_lock); + while (!list_empty(&vma_slot_new)) { + slot = list_entry(vma_slot_new.next, + struct vma_slot, slot_list); + + if (!slot->vma->anon_vma) { + list_move(&slot->slot_list, &empty_vma_list); + } else if (vma_can_enter(slot->vma)) { + batch_slots[index++] = slot; + list_del_init(&slot->slot_list); + } else { + list_move(&slot->slot_list, &vma_slot_noadd); + } + + if (++i == SPIN_LOCK_PERIOD || + (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { + spin_unlock(&vma_slot_list_lock); + + if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { + uksm_vma_enter(batch_slots, index); + index = 0; + } + i = 0; + cond_resched(); + spin_lock(&vma_slot_list_lock); + } + } + + list_splice(&empty_vma_list, &vma_slot_new); + + spin_unlock(&vma_slot_list_lock); + + if (index) + uksm_vma_enter(batch_slots, index); + +} + +static inline int rung_round_finished(struct scan_rung *rung) +{ + return rung->flags & UKSM_RUNG_ROUND_FINISHED; +} + +static inline void judge_slot(struct vma_slot *slot) +{ + struct scan_rung *rung = slot->rung; + unsigned long dedup; + int deleted; + + dedup = cal_dedup_ratio(slot); + if (vma_fully_scanned(slot) && uksm_thrash_threshold) + deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); + else if (dedup && dedup >= uksm_abundant_threshold) + deleted = vma_rung_up(slot); + else + deleted = vma_rung_down(slot); + + slot->pages_merged = 0; + slot->pages_cowed = 0; + + if (vma_fully_scanned(slot)) + slot->pages_scanned = 0; + + slot->last_scanned = slot->pages_scanned; + + /* If its deleted in above, then rung was already advanced. */ + if (!deleted) + advance_current_scan(rung); +} + + +static inline int hash_round_finished(void) +{ + if (scanned_virtual_pages > (uksm_pages_total >> 2)) { + scanned_virtual_pages = 0; + if (uksm_pages_scanned) + fully_scanned_round++; + + return 1; + } else { + return 0; + } +} + +#define UKSM_MMSEM_BATCH 5 +#define BUSY_RETRY 100 + +/** + * uksm_do_scan() - the main worker function. 
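+ *
+ * Walks every rung of the scan ladder, consuming each rung's
+ * pages_to_scan quota, then reaps dead slots, re-enters new ones,
+ * updates the per-page time EMA and computes the next sleep interval.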
+ */ +static noinline void uksm_do_scan(void) +{ + struct vma_slot *slot, *iter; + struct mm_struct *busy_mm; + unsigned char round_finished, all_rungs_emtpy; + int i, err, mmsem_batch; + unsigned long pcost; + long long delta_exec; + unsigned long vpages, max_cpu_ratio; + unsigned long long start_time, end_time, scan_time; + unsigned int expected_jiffies; + + might_sleep(); + + vpages = 0; + + start_time = task_sched_runtime(current); + max_cpu_ratio = 0; + mmsem_batch = 0; + + for (i = 0; i < SCAN_LADDER_SIZE;) { + struct scan_rung *rung = &uksm_scan_ladder[i]; + unsigned long ratio; + int busy_retry; + + if (!rung->pages_to_scan) { + i++; + continue; + } + + if (!rung->vma_root.num) { + rung->pages_to_scan = 0; + i++; + continue; + } + + ratio = rung_real_ratio(rung->cpu_ratio); + if (ratio > max_cpu_ratio) + max_cpu_ratio = ratio; + + busy_retry = BUSY_RETRY; + /* + * Do not consider rung_round_finished() here, just used up the + * rung->pages_to_scan quota. + */ + while (rung->pages_to_scan && rung->vma_root.num && + likely(!freezing(current))) { + int reset = 0; + + slot = rung->current_scan; + + BUG_ON(vma_fully_scanned(slot)); + + if (mmsem_batch) { + err = 0; + } else { + err = try_down_read_slot_mmap_sem(slot); + } + + if (err == -ENOENT) { +rm_slot: + rung_rm_slot(slot); + continue; + } + + busy_mm = slot->mm; + + if (err == -EBUSY) { + /* skip other vmas on the same mm */ + do { + reset = advance_current_scan(rung); + iter = rung->current_scan; + busy_retry--; + if (iter->vma->vm_mm != busy_mm || + !busy_retry || reset) + break; + } while (1); + + if (iter->vma->vm_mm != busy_mm) { + continue; + } else { + /* scan round finsished */ + break; + } + } + + BUG_ON(!vma_can_enter(slot->vma)); + if (uksm_test_exit(slot->vma->vm_mm)) { + mmsem_batch = 0; + up_read(&slot->vma->vm_mm->mmap_sem); + goto rm_slot; + } + + if (mmsem_batch) + mmsem_batch--; + else + mmsem_batch = UKSM_MMSEM_BATCH; + + /* Ok, we have take the mmap_sem, ready to scan */ + scan_vma_one_page(slot); + rung->pages_to_scan--; + vpages++; + + if (rung->current_offset + rung->step > slot->pages - 1 + || vma_fully_scanned(slot)) { + up_read(&slot->vma->vm_mm->mmap_sem); + judge_slot(slot); + mmsem_batch = 0; + } else { + rung->current_offset += rung->step; + if (!mmsem_batch) + up_read(&slot->vma->vm_mm->mmap_sem); + } + + busy_retry = BUSY_RETRY; + cond_resched(); + } + + if (mmsem_batch) { + up_read(&slot->vma->vm_mm->mmap_sem); + mmsem_batch = 0; + } + + if (freezing(current)) + break; + + cond_resched(); + } + end_time = task_sched_runtime(current); + delta_exec = end_time - start_time; + + if (freezing(current)) + return; + + cleanup_vma_slots(); + uksm_enter_all_slots(); + + round_finished = 1; + all_rungs_emtpy = 1; + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + struct scan_rung *rung = &uksm_scan_ladder[i]; + + if (rung->vma_root.num) { + all_rungs_emtpy = 0; + if (!rung_round_finished(rung)) + round_finished = 0; + } + } + + if (all_rungs_emtpy) + round_finished = 0; + + if (round_finished) { + round_update_ladder(); + uksm_eval_round++; + + if (hash_round_finished() && rshash_adjust()) { + /* Reset the unstable root iff hash strength changed */ + uksm_hash_round++; + root_unstable_tree = RB_ROOT; + free_all_tree_nodes(&unstable_tree_node_list); + } + + /* + * A number of pages can hang around indefinitely on per-cpu + * pagevecs, raised page count preventing write_protect_page + * from merging them. 
Though it doesn't really matter much, + * it is puzzling to see some stuck in pages_volatile until + * other activity jostles them out, and they also prevented + * LTP's KSM test from succeeding deterministically; so drain + * them here (here rather than on entry to uksm_do_scan(), + * so we don't IPI too often when pages_to_scan is set low). + */ + lru_add_drain_all(); + } + + + if (vpages && delta_exec > 0) { + pcost = (unsigned long) delta_exec / vpages; + if (likely(uksm_ema_page_time)) + uksm_ema_page_time = ema(pcost, uksm_ema_page_time); + else + uksm_ema_page_time = pcost; + } + + uksm_calc_scan_pages(); + uksm_sleep_real = uksm_sleep_jiffies; + /* in case of radical cpu bursts, apply the upper bound */ + end_time = task_sched_runtime(current); + if (max_cpu_ratio && end_time > start_time) { + scan_time = end_time - start_time; + expected_jiffies = msecs_to_jiffies( + scan_time_to_sleep(scan_time, max_cpu_ratio)); + + if (expected_jiffies > uksm_sleep_real) + uksm_sleep_real = expected_jiffies; + + /* We have a 1 second up bound for responsiveness. */ + if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) + uksm_sleep_real = msecs_to_jiffies(1000); + } + + return; +} + +static int ksmd_should_run(void) +{ + return uksm_run & UKSM_RUN_MERGE; +} + +static int uksm_scan_thread(void *nothing) +{ + set_freezable(); + set_user_nice(current, 5); + + while (!kthread_should_stop()) { + mutex_lock(&uksm_thread_mutex); + if (ksmd_should_run()) { + uksm_do_scan(); + } + mutex_unlock(&uksm_thread_mutex); + + try_to_freeze(); + + if (ksmd_should_run()) { + schedule_timeout_interruptible(uksm_sleep_real); + uksm_sleep_times++; + } else { + wait_event_freezable(uksm_thread_wait, + ksmd_should_run() || kthread_should_stop()); + } + } + return 0; +} + +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) +{ + struct stable_node *stable_node; + struct node_vma *node_vma; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + unsigned long address; + + VM_BUG_ON_PAGE(!PageKsm(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + + stable_node = page_stable_node(page); + if (!stable_node) + return ret; +again: + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + anon_vma_lock_read(anon_vma); + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { + vma = vmac->vma; + address = get_rmap_addr(rmap_item); + + if (address < vma->vm_start || + address >= vma->vm_end) + continue; + + if ((rmap_item->slot->vma == vma) == + search_new_forks) + continue; + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + + ret = rwc->rmap_one(page, vma, address, rwc->arg); + if (ret != SWAP_AGAIN) { + anon_vma_unlock_read(anon_vma); + goto out; + } + + if (rwc->done && rwc->done(page)) { + anon_vma_unlock_read(anon_vma); + goto out; + } + } + anon_vma_unlock_read(anon_vma); + } + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +#ifdef CONFIG_MIGRATION +/* Common ksm interface but may be specific to uksm */ +void ksm_migrate_page(struct page *newpage, struct page *oldpage) +{ + struct stable_node *stable_node; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON(newpage->mapping != oldpage->mapping); + + stable_node = page_stable_node(newpage); + if (stable_node) { + 
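+		/*
+		 * The stable node still holds the old page's pfn;
+		 * re-point it at the migrated copy.
+		 */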
VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); + stable_node->kpfn = page_to_pfn(newpage); + } +} +#endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_MEMORY_HOTREMOVE +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, + unsigned long end_pfn) +{ + struct rb_node *node; + + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { + struct stable_node *stable_node; + + stable_node = rb_entry(node, struct stable_node, node); + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) + return stable_node; + } + return NULL; +} + +static int uksm_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + struct stable_node *stable_node; + + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Keep it very simple for now: just lock out ksmd and + * MADV_UNMERGEABLE while any memory is going offline. + * mutex_lock_nested() is necessary because lockdep was alarmed + * that here we take uksm_thread_mutex inside notifier chain + * mutex, and later take notifier chain mutex inside + * uksm_thread_mutex to unlock it. But that's safe because both + * are inside mem_hotplug_mutex. + */ + mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); + break; + + case MEM_OFFLINE: + /* + * Most of the work is done by page migration; but there might + * be a few stable_nodes left over, still pointing to struct + * pages which have been offlined: prune those from the tree. + */ + while ((stable_node = uksm_check_stable_tree(mn->start_pfn, + mn->start_pfn + mn->nr_pages)) != NULL) + remove_node_from_stable_tree(stable_node, 1, 1); + /* fallthrough */ + + case MEM_CANCEL_OFFLINE: + mutex_unlock(&uksm_thread_mutex); + break; + } + return NOTIFY_OK; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +#ifdef CONFIG_SYSFS +/* + * This all compiles without CONFIG_SYSFS, but is a waste of space. 
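+ * Everything below appears under /sys/kernel/mm/uksm/ once uksm_init()
+ * has registered the attribute group on mm_kobj.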
+ */ + +#define UKSM_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define UKSM_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t max_cpu_percentage_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", uksm_max_cpu_percentage); +} + +static ssize_t max_cpu_percentage_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long max_cpu_percentage; + int err; + + err = kstrtoul(buf, 10, &max_cpu_percentage); + if (err || max_cpu_percentage > 100) + return -EINVAL; + + if (max_cpu_percentage == 100) + max_cpu_percentage = 99; + else if (max_cpu_percentage < 10) + max_cpu_percentage = 10; + + uksm_max_cpu_percentage = max_cpu_percentage; + + return count; +} +UKSM_ATTR(max_cpu_percentage); + +static ssize_t sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); +} + +static ssize_t sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > MSEC_PER_SEC) + return -EINVAL; + + uksm_sleep_jiffies = msecs_to_jiffies(msecs); + uksm_sleep_saved = uksm_sleep_jiffies; + + return count; +} +UKSM_ATTR(sleep_millisecs); + + +static ssize_t cpu_governor_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); + int i; + + buf[0] = '\0'; + for (i = 0; i < n ; i++) { + if (uksm_cpu_governor == i) + strcat(buf, "["); + + strcat(buf, uksm_cpu_governor_str[i]); + + if (uksm_cpu_governor == i) + strcat(buf, "]"); + + strcat(buf, " "); + } + strcat(buf, "\n"); + + return strlen(buf); +} + +static inline void init_performance_values(void) +{ + int i; + struct scan_rung *rung; + struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; + + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + rung = uksm_scan_ladder + i; + rung->cpu_ratio = preset->cpu_ratio[i]; + rung->cover_msecs = preset->cover_msecs[i]; + } + + uksm_max_cpu_percentage = preset->max_cpu; +} + +static ssize_t cpu_governor_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); + + for (n--; n >=0 ; n--) { + if (!strncmp(buf, uksm_cpu_governor_str[n], + strlen(uksm_cpu_governor_str[n]))) + break; + } + + if (n < 0) + return -EINVAL; + else + uksm_cpu_governor = n; + + init_performance_values(); + + return count; +} +UKSM_ATTR(cpu_governor); + +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", uksm_run); +} + +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long flags; + + err = kstrtoul(buf, 10, &flags); + if (err || flags > UINT_MAX) + return -EINVAL; + if (flags > UKSM_RUN_MERGE) + return -EINVAL; + + mutex_lock(&uksm_thread_mutex); + if (uksm_run != flags) { + uksm_run = flags; + } + mutex_unlock(&uksm_thread_mutex); + + if (flags & UKSM_RUN_MERGE) + wake_up_interruptible(&uksm_thread_wait); + + return count; +} +UKSM_ATTR(run); + +static ssize_t abundant_threshold_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", 
uksm_abundant_threshold);
+}
+
+static ssize_t abundant_threshold_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t count)
+{
+	int err;
+	unsigned long flags;
+
+	err = kstrtoul(buf, 10, &flags);
+	if (err || flags > 99)
+		return -EINVAL;
+
+	uksm_abundant_threshold = flags;
+
+	return count;
+}
+UKSM_ATTR(abundant_threshold);
+
+static ssize_t thrash_threshold_show(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", uksm_thrash_threshold);
+}
+
+static ssize_t thrash_threshold_store(struct kobject *kobj,
+				      struct kobj_attribute *attr,
+				      const char *buf, size_t count)
+{
+	int err;
+	unsigned long flags;
+
+	err = kstrtoul(buf, 10, &flags);
+	if (err || flags > 99)
+		return -EINVAL;
+
+	uksm_thrash_threshold = flags;
+
+	return count;
+}
+UKSM_ATTR(thrash_threshold);
+
+static ssize_t cpu_ratios_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	int i, size;
+	struct scan_rung *rung;
+	char *p = buf;
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		rung = &uksm_scan_ladder[i];
+
+		if (rung->cpu_ratio > 0)
+			size = sprintf(p, "%d ", rung->cpu_ratio);
+		else
+			size = sprintf(p, "MAX/%d ",
+					TIME_RATIO_SCALE / -rung->cpu_ratio);
+
+		p += size;
+	}
+
+	*p++ = '\n';
+	*p = '\0';
+
+	return p - buf;
+}
+
+static ssize_t cpu_ratios_store(struct kobject *kobj,
+				struct kobj_attribute *attr,
+				const char *buf, size_t count)
+{
+	int i, cpuratios[SCAN_LADDER_SIZE], err;
+	unsigned long value;
+	struct scan_rung *rung;
+	char *p, *base, *end = NULL;
+
+	/* +1 so the copy is always NUL-terminated for kstrtoul() */
+	base = kzalloc(count + 1, GFP_KERNEL);
+	if (!base)
+		return -ENOMEM;
+
+	memcpy(base, buf, count);
+	p = base;
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		if (i != SCAN_LADDER_SIZE - 1) {
+			end = strchr(p, ' ');
+			if (!end) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			*end = '\0';
+		}
+
+		if (strstr(p, "MAX/")) {
+			p = strchr(p, '/') + 1;
+			err = kstrtoul(p, 10, &value);
+			if (err || value > TIME_RATIO_SCALE || !value) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
+		} else {
+			err = kstrtoul(p, 10, &value);
+			if (err || value > TIME_RATIO_SCALE || !value) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			cpuratios[i] = value;
+		}
+
+		p = end + 1;
+	}
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		rung = &uksm_scan_ladder[i];
+
+		rung->cpu_ratio = cpuratios[i];
+	}
+
+	/* don't leak the scratch buffer on either path */
+	kfree(base);
+	return count;
+
+out:
+	kfree(base);
+	return err;
+}
+UKSM_ATTR(cpu_ratios);
+
+static ssize_t eval_intervals_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	int i, size;
+	struct scan_rung *rung;
+	char *p = buf;
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		rung = &uksm_scan_ladder[i];
+		size = sprintf(p, "%u ", rung->cover_msecs);
+		p += size;
+	}
+
+	*p++ = '\n';
+	*p = '\0';
+
+	return p - buf;
+}
+
+static ssize_t eval_intervals_store(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    const char *buf, size_t count)
+{
+	int i, err;
+	unsigned long values[SCAN_LADDER_SIZE];
+	struct scan_rung *rung;
+	char *p, *base, *end = NULL;
+
+	/* +1 so the copy is always NUL-terminated for kstrtoul() */
+	base = kzalloc(count + 1, GFP_KERNEL);
+	if (!base)
+		return -ENOMEM;
+
+	memcpy(base, buf, count);
+	p = base;
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		if (i != SCAN_LADDER_SIZE - 1) {
+			end = strchr(p, ' ');
+			if (!end) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			*end = '\0';
+		}
+
+		err = kstrtoul(p, 10, &values[i]);
+		if (err) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		p = end + 1;
+	}
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		rung = &uksm_scan_ladder[i];
+
+		rung->cover_msecs = values[i];
+	}
+
+	kfree(base);
+	return count;
+
+out:
+	kfree(base);
+	return err;
+}
+UKSM_ATTR(eval_intervals);
+
+static ssize_t ema_per_page_time_show(struct kobject *kobj,
+				      struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, 
"%lu\n", uksm_ema_page_time); +} +UKSM_ATTR_RO(ema_per_page_time); + +static ssize_t pages_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", uksm_pages_shared); +} +UKSM_ATTR_RO(pages_shared); + +static ssize_t pages_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", uksm_pages_sharing); +} +UKSM_ATTR_RO(pages_sharing); + +static ssize_t pages_unshared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", uksm_pages_unshared); +} +UKSM_ATTR_RO(pages_unshared); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", fully_scanned_round); +} +UKSM_ATTR_RO(full_scans); + +static ssize_t pages_scanned_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned long base = 0; + u64 delta, ret; + + if (pages_scanned_stored) { + base = pages_scanned_base; + ret = pages_scanned_stored; + delta = uksm_pages_scanned >> base; + if (CAN_OVERFLOW_U64(ret, delta)) { + ret >>= 1; + delta >>= 1; + base++; + ret += delta; + } + } else { + ret = uksm_pages_scanned; + } + + while (ret > ULONG_MAX) { + ret >>= 1; + base++; + } + + if (base) + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); + else + return sprintf(buf, "%lu\n", (unsigned long)ret); +} +UKSM_ATTR_RO(pages_scanned); + +static ssize_t hash_strength_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", hash_strength); +} +UKSM_ATTR_RO(hash_strength); + +static ssize_t sleep_times_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", uksm_sleep_times); +} +UKSM_ATTR_RO(sleep_times); + + +static struct attribute *uksm_attrs[] = { + &max_cpu_percentage_attr.attr, + &sleep_millisecs_attr.attr, + &cpu_governor_attr.attr, + &run_attr.attr, + &ema_per_page_time_attr.attr, + &pages_shared_attr.attr, + &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &full_scans_attr.attr, + &pages_scanned_attr.attr, + &hash_strength_attr.attr, + &sleep_times_attr.attr, + &thrash_threshold_attr.attr, + &abundant_threshold_attr.attr, + &cpu_ratios_attr.attr, + &eval_intervals_attr.attr, + NULL, +}; + +static struct attribute_group uksm_attr_group = { + .attrs = uksm_attrs, + .name = "uksm", +}; +#endif /* CONFIG_SYSFS */ + +static inline void init_scan_ladder(void) +{ + int i; + struct scan_rung *rung; + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + rung = uksm_scan_ladder + i; + slot_tree_init_root(&rung->vma_root); + } + + init_performance_values(); + uksm_calc_scan_pages(); +} + +static inline int cal_positive_negative_costs(void) +{ + struct page *p1, *p2; + unsigned char *addr1, *addr2; + unsigned long i, time_start, hash_cost; + unsigned long loopnum = 0; + + /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. 
*/
+	volatile u32 hash;
+	volatile int ret;
+
+	p1 = alloc_page(GFP_KERNEL);
+	if (!p1)
+		return -ENOMEM;
+
+	p2 = alloc_page(GFP_KERNEL);
+	if (!p2) {
+		/* don't leak p1 on the error path */
+		__free_page(p1);
+		return -ENOMEM;
+	}
+
+	addr1 = kmap_atomic(p1);
+	addr2 = kmap_atomic(p2);
+	memset(addr1, prandom_u32(), PAGE_SIZE);
+	memcpy(addr2, addr1, PAGE_SIZE);
+
+	/* make sure that the two pages differ in last byte */
+	addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
+	kunmap_atomic(addr2);
+	kunmap_atomic(addr1);
+
+	time_start = jiffies;
+	while (jiffies - time_start < 100) {
+		for (i = 0; i < 100; i++)
+			hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
+		loopnum += 100;
+	}
+	hash_cost = (jiffies - time_start);
+
+	time_start = jiffies;
+	for (i = 0; i < loopnum; i++)
+		ret = pages_identical(p1, p2);
+	memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
+	memcmp_cost /= hash_cost;
+	printk(KERN_INFO "UKSM: relative memcmp_cost = %lu "
+		"hash=%u cmp_ret=%d.\n",
+		memcmp_cost, hash, ret);
+
+	__free_page(p1);
+	__free_page(p2);
+	return 0;
+}
+
+static int init_zeropage_hash_table(void)
+{
+	struct page *page;
+	char *addr;
+	int i;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	addr = kmap_atomic(page);
+	memset(addr, 0, PAGE_SIZE);
+	kunmap_atomic(addr);
+
+	zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
+		GFP_KERNEL);
+	if (!zero_hash_table) {
+		/* don't leak the scratch page on the error path */
+		__free_page(page);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < HASH_STRENGTH_MAX; i++)
+		zero_hash_table[i] = page_hash(page, i, 0);
+
+	__free_page(page);
+
+	return 0;
+}
+
+static inline int init_random_sampling(void)
+{
+	unsigned long i;
+	random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!random_nums)
+		return -ENOMEM;
+
+	for (i = 0; i < HASH_STRENGTH_FULL; i++)
+		random_nums[i] = i;
+
+	for (i = 0; i < HASH_STRENGTH_FULL; i++) {
+		unsigned long rand_range, swap_index, tmp;
+
+		rand_range = HASH_STRENGTH_FULL - i;
+		swap_index = i + prandom_u32() % rand_range;
+		tmp = random_nums[i];
+		random_nums[i] = random_nums[swap_index];
+		random_nums[swap_index] = tmp;
+	}
+
+	rshash_state.state = RSHASH_NEW;
+	rshash_state.below_count = 0;
+	rshash_state.lookup_window_index = 0;
+
+	return cal_positive_negative_costs();
+}
+
+static int __init uksm_slab_init(void)
+{
+	rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
+	if (!rmap_item_cache)
+		goto out;
+
+	stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
+	if (!stable_node_cache)
+		goto out_free1;
+
+	node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
+	if (!node_vma_cache)
+		goto out_free2;
+
+	vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
+	if (!vma_slot_cache)
+		goto out_free3;
+
+	tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
+	if (!tree_node_cache)
+		goto out_free4;
+
+	return 0;
+
+out_free4:
+	kmem_cache_destroy(vma_slot_cache);
+out_free3:
+	kmem_cache_destroy(node_vma_cache);
+out_free2:
+	kmem_cache_destroy(stable_node_cache);
+out_free1:
+	kmem_cache_destroy(rmap_item_cache);
+out:
+	return -ENOMEM;
+}
+
+static void __init uksm_slab_free(void)
+{
+	kmem_cache_destroy(stable_node_cache);
+	kmem_cache_destroy(rmap_item_cache);
+	kmem_cache_destroy(node_vma_cache);
+	kmem_cache_destroy(vma_slot_cache);
+	kmem_cache_destroy(tree_node_cache);
+}
+
+/* Common interface to ksm, but the semantics differ:
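+ * UKSM scans all anonymous VMAs by itself, so MADV_MERGEABLE is
+ * accepted but ignored; only MADV_UNMERGEABLE has any effect.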
*/ +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags) +{ + int err; + + switch (advice) { + case MADV_MERGEABLE: + return 0; /* just ignore the advice */ + + case MADV_UNMERGEABLE: + if (!(*vm_flags & VM_MERGEABLE)) + return 0; /* just ignore the advice */ + + if (vma->anon_vma) { + err = unmerge_uksm_pages(vma, start, end); + if (err) + return err; + } + + uksm_remove_vma(vma); + *vm_flags &= ~VM_MERGEABLE; + break; + } + + return 0; +} + +/* Common interface to ksm, actually the same. */ +struct page *ksm_might_need_to_copy(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = page_anon_vma(page); + struct page *new_page; + + if (PageKsm(page)) { + if (page_stable_node(page)) + return page; /* no need to copy it */ + } else if (!anon_vma) { + return page; /* no need to copy it */ + } else if (anon_vma->root == vma->anon_vma->root && + page->index == linear_page_index(vma, address)) { + return page; /* still no need to copy it */ + } + if (!PageUptodate(page)) + return page; /* let do_swap_page report the error */ + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (new_page) { + copy_user_highpage(new_page, page, address, vma); + + SetPageDirty(new_page); + __SetPageUptodate(new_page); + __set_page_locked(new_page); + } + + return new_page; +} + +static int __init uksm_init(void) +{ + struct task_struct *uksm_thread; + int err; + + uksm_sleep_jiffies = msecs_to_jiffies(100); + uksm_sleep_saved = uksm_sleep_jiffies; + + slot_tree_init(); + init_scan_ladder(); + + + err = init_random_sampling(); + if (err) + goto out_free2; + + err = uksm_slab_init(); + if (err) + goto out_free1; + + err = init_zeropage_hash_table(); + if (err) + goto out_free0; + + uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); + if (IS_ERR(uksm_thread)) { + printk(KERN_ERR "uksm: creating kthread failed\n"); + err = PTR_ERR(uksm_thread); + goto out_free; + } + +#ifdef CONFIG_SYSFS + err = sysfs_create_group(mm_kobj, &uksm_attr_group); + if (err) { + printk(KERN_ERR "uksm: register sysfs failed\n"); + kthread_stop(uksm_thread); + goto out_free; + } +#else + uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ + +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MEMORY_HOTREMOVE + /* + * Choose a high priority since the callback takes uksm_thread_mutex: + * later callbacks could only be taking locks which nest within that. 
+ */ + hotplug_memory_notifier(uksm_memory_callback, 100); +#endif + return 0; + +out_free: + kfree(zero_hash_table); +out_free0: + uksm_slab_free(); +out_free1: + kfree(random_nums); +out_free2: + kfree(uksm_scan_ladder); + return err; +} + +#ifdef MODULE +subsys_initcall(ksm_init); +#else +late_initcall(uksm_init); +#endif + diff -Nur linux-4.2/mm/vmstat.c linux-4.2-pck/mm/vmstat.c --- linux-4.2/mm/vmstat.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/vmstat.c 2015-09-08 00:51:03.470667662 -0300 @@ -740,6 +740,9 @@ "nr_anon_transparent_hugepages", "nr_free_cma", +#ifdef CONFIG_UKSM + "nr_uksm_zero_pages", +#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", diff -Nur linux-4.2/net/core/secure_seq.c linux-4.2-pck/net/core/secure_seq.c --- linux-4.2/net/core/secure_seq.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/core/secure_seq.c 2015-09-08 00:48:31.501257347 -0300 @@ -8,7 +8,11 @@ #include #include #include +#include +#include +#include +#include #include #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) @@ -39,6 +43,102 @@ } #endif +#ifdef CONFIG_TCP_STEALTH +u32 tcp_stealth_sequence_number(struct sock *sk, __be32 *daddr, + u32 daddr_size, __be16 dport) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_key *md5; + + __u32 sec[MD5_MESSAGE_BYTES / sizeof(__u32)]; + __u32 i; + __u32 tsval = 0; + + __be32 iv[MD5_DIGEST_WORDS] = { 0 }; + __be32 isn; + + memcpy(iv, daddr, (daddr_size > sizeof(iv)) ? sizeof(iv) : daddr_size); + +#ifdef CONFIG_TCP_MD5SIG + md5 = tp->af_specific->md5_lookup(sk, sk); +#else + md5 = NULL; +#endif + if (likely(sysctl_tcp_timestamps && !md5) || tp->stealth.saw_tsval) + tsval = tp->stealth.mstamp.stamp_jiffies; + + ((__be16 *)iv)[2] ^= cpu_to_be16(tp->stealth.integrity_hash); + iv[2] ^= cpu_to_be32(tsval); + ((__be16 *)iv)[6] ^= dport; + + for (i = 0; i < MD5_DIGEST_WORDS; i++) + iv[i] = le32_to_cpu(iv[i]); + for (i = 0; i < MD5_MESSAGE_BYTES / sizeof(__le32); i++) + sec[i] = le32_to_cpu(((__le32 *)tp->stealth.secret)[i]); + + md5_transform(iv, sec); + + isn = cpu_to_be32(iv[0]) ^ cpu_to_be32(iv[1]) ^ + cpu_to_be32(iv[2]) ^ cpu_to_be32(iv[3]); + + if (tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY) + be32_isn_to_be16_ih(isn) = + cpu_to_be16(tp->stealth.integrity_hash); + + return be32_to_cpu(isn); +} +EXPORT_SYMBOL(tcp_stealth_sequence_number); + +u32 tcp_stealth_do_auth(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcphdr *th = tcp_hdr(skb); + __be32 isn = th->seq; + __be32 hash; + __be32 *daddr; + u32 daddr_size; + + tp->stealth.saw_tsval = + tcp_parse_tsval_option(&tp->stealth.mstamp.stamp_jiffies, th); + + if (tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) + tp->stealth.integrity_hash = + be16_to_cpu(be32_isn_to_be16_ih(isn)); + + switch (tp->inet_conn.icsk_inet.sk.sk_family) { +#if IS_ENABLED(CONFIG_IPV6) + case PF_INET6: + daddr_size = sizeof(ipv6_hdr(skb)->daddr.s6_addr32); + daddr = ipv6_hdr(skb)->daddr.s6_addr32; + break; +#endif + case PF_INET: + daddr_size = sizeof(ip_hdr(skb)->daddr); + daddr = &ip_hdr(skb)->daddr; + break; + default: + pr_err("TCP Stealth: Unknown network layer protocol, stop!\n"); + return 1; + } + + hash = tcp_stealth_sequence_number(sk, daddr, daddr_size, th->dest); + cpu_to_be32s(&hash); + + if (tp->stealth.mode & TCP_STEALTH_MODE_AUTH && + tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN && + be32_isn_to_be16_av(isn) == be32_isn_to_be16_av(hash)) + return 0; + + if (tp->stealth.mode & 
TCP_STEALTH_MODE_AUTH && + !(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) && + isn == hash) + return 0; + + return 1; +} +EXPORT_SYMBOL(tcp_stealth_do_auth); +#endif + #if IS_ENABLED(CONFIG_IPV6) __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport) diff -Nur linux-4.2/net/ethernet/eth.c linux-4.2-pck/net/ethernet/eth.c --- linux-4.2/net/ethernet/eth.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ethernet/eth.c 2015-09-08 00:48:22.241696581 -0300 @@ -355,7 +355,7 @@ dev->hard_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->addr_len = ETH_ALEN; - dev->tx_queue_len = 1000; /* Ethernet wants good queues */ + dev->tx_queue_len = 50; /* Ethernet wants good latency. Use FreeBSD defaults. */ dev->flags = IFF_BROADCAST|IFF_MULTICAST; dev->priv_flags |= IFF_TX_SKB_SHARING; diff -Nur linux-4.2/net/ipv4/Kconfig linux-4.2-pck/net/ipv4/Kconfig --- linux-4.2/net/ipv4/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ipv4/Kconfig 2015-09-08 00:48:31.501257347 -0300 @@ -653,6 +653,9 @@ config DEFAULT_VEGAS bool "Vegas" if TCP_CONG_VEGAS=y + config DEFAULT_YEAH + bool "YeAH" if TCP_CONG_YEAH=y + config DEFAULT_VENO bool "Veno" if TCP_CONG_VENO=y @@ -683,6 +686,7 @@ default "htcp" if DEFAULT_HTCP default "hybla" if DEFAULT_HYBLA default "vegas" if DEFAULT_VEGAS + default "yeah" if DEFAULT_YEAH default "westwood" if DEFAULT_WESTWOOD default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO @@ -700,3 +704,13 @@ on the Internet. If unsure, say N. + +config TCP_STEALTH + bool "TCP: Stealth TCP socket support" + default n + ---help--- + This option enables support for stealth TCP sockets. If you do not + know what this means, you do not need it. + + If unsure, say N. + diff -Nur linux-4.2/net/ipv4/tcp.c linux-4.2-pck/net/ipv4/tcp.c --- linux-4.2/net/ipv4/tcp.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ipv4/tcp.c 2015-09-08 00:48:31.504590523 -0300 @@ -2313,6 +2313,43 @@ return 0; } +#ifdef CONFIG_TCP_STEALTH +int tcp_stealth_integrity(__be16 *hash, u8 *secret, u8 *payload, int len) +{ + struct scatterlist sg[2]; + struct crypto_hash *tfm; + struct hash_desc desc; + __be16 h[MD5_DIGEST_WORDS * 2]; + int i; + int err = 0; + + tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + err = -PTR_ERR(tfm); + goto out; + } + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(sg, 2); + sg_set_buf(&sg[0], secret, MD5_MESSAGE_BYTES); + sg_set_buf(&sg[1], payload, len); + + if (crypto_hash_digest(&desc, sg, MD5_MESSAGE_BYTES + len, (u8 *)h)) { + err = -EFAULT; + goto out; + } + + *hash = be16_to_cpu(h[0]); + for (i = 1; i < MD5_DIGEST_WORDS * 2; i++) + *hash ^= be16_to_cpu(h[i]); + +out: + crypto_free_hash(tfm); + return err; +} +#endif + /* * Socket option code for TCP. 
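+ *
+ * The TCP_STEALTH and TCP_STEALTH_INTEGRITY options stash the shared
+ * secret and the expected payload hash in the tcp_sock under the
+ * socket lock.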
*/ @@ -2343,6 +2380,66 @@ release_sock(sk); return err; } +#ifdef CONFIG_TCP_STEALTH + case TCP_STEALTH: { + u8 secret[MD5_MESSAGE_BYTES] = { 0 }; + + val = copy_from_user(secret, optval, + min_t(unsigned int, optlen, + MD5_MESSAGE_BYTES)); + if (val != 0) + return -EFAULT; + + lock_sock(sk); + memcpy(tp->stealth.secret, secret, MD5_MESSAGE_BYTES); + tp->stealth.mode = TCP_STEALTH_MODE_AUTH; + tp->stealth.mstamp.v64 = 0; + tp->stealth.saw_tsval = false; + release_sock(sk); + return err; + } + case TCP_STEALTH_INTEGRITY: { + u8 *payload; + + lock_sock(sk); + + if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) { + err = -EOPNOTSUPP; + goto stealth_integrity_out_1; + } + + if (optlen < 1 || optlen > USHRT_MAX) { + err = -EINVAL; + goto stealth_integrity_out_1; + } + + payload = vmalloc(optlen); + if (!payload) { + err = -ENOMEM; + goto stealth_integrity_out_1; + } + + val = copy_from_user(payload, optval, optlen); + if (val != 0) { + err = -EFAULT; + goto stealth_integrity_out_2; + } + + err = tcp_stealth_integrity(&tp->stealth.integrity_hash, + tp->stealth.secret, payload, + optlen); + if (err) + goto stealth_integrity_out_2; + + tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY; + +stealth_integrity_out_2: + vfree(payload); +stealth_integrity_out_1: + release_sock(sk); + return err; + } +#endif default: /* fallthru */ break; @@ -2594,6 +2691,18 @@ tp->notsent_lowat = val; sk->sk_write_space(sk); break; +#ifdef CONFIG_TCP_STEALTH + case TCP_STEALTH_INTEGRITY_LEN: + if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) { + err = -EOPNOTSUPP; + } else if (val < 1 || val > USHRT_MAX) { + err = -EINVAL; + } else { + tp->stealth.integrity_len = val; + tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY_LEN; + } + break; +#endif default: err = -ENOPROTOOPT; break; diff -Nur linux-4.2/net/ipv4/tcp_input.c linux-4.2-pck/net/ipv4/tcp_input.c --- linux-4.2/net/ipv4/tcp_input.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ipv4/tcp_input.c 2015-09-08 00:48:31.504590523 -0300 @@ -77,6 +77,9 @@ #include int sysctl_tcp_timestamps __read_mostly = 1; +#ifdef CONFIG_TCP_STEALTH +EXPORT_SYMBOL(sysctl_tcp_timestamps); +#endif int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; @@ -3796,6 +3799,47 @@ return true; } +#ifdef CONFIG_TCP_STEALTH +/* Parse only the TSVal field of the TCP Timestamp option header. 
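+ * Unlike the full tcp_parse_options(), it walks the option list only
+ * until the timestamp option is found, so it is cheap enough to run on
+ * every incoming SYN that needs stealth authentication.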
+ */ +const bool tcp_parse_tsval_option(u32 *tsval, const struct tcphdr *th) +{ + int length = (th->doff << 2) - sizeof(*th); + const u8 *ptr = (const u8 *)(th + 1); + + /* If the TCP option is too short, we can short cut */ + if (length < TCPOLEN_TIMESTAMP) + return false; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return false; + case TCPOPT_NOP: + length--; + continue; + case TCPOPT_TIMESTAMP: + opsize = *ptr++; + if (opsize != TCPOLEN_TIMESTAMP || opsize > length) + return false; + *tsval = get_unaligned_be32(ptr); + return true; + default: + opsize = *ptr++; + if (opsize < 2 || opsize > length) + return false; + } + ptr += opsize - 2; + length -= opsize; + } + return false; +} +EXPORT_SYMBOL(tcp_parse_tsval_option); +#endif + #ifdef CONFIG_TCP_MD5SIG /* * Parse MD5 Signature option @@ -4465,6 +4509,31 @@ return -ENOMEM; } +#ifdef CONFIG_TCP_STEALTH +static int __tcp_stealth_integrity_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + u16 hash; + __be32 seq = cpu_to_be32(TCP_SKB_CB(skb)->seq - 1); + char *data = skb->data + th->doff * 4; + int len = skb->len - th->doff * 4; + + if (len < tp->stealth.integrity_len) + return 1; + + if (tcp_stealth_integrity(&hash, tp->stealth.secret, data, + tp->stealth.integrity_len)) + return 1; + + if (be32_isn_to_be16_ih(seq) != cpu_to_be16(hash)) + return 1; + + tp->stealth.mode &= ~TCP_STEALTH_MODE_INTEGRITY_LEN; + return 0; +} +#endif + static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4474,6 +4543,14 @@ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) goto drop; +#ifdef CONFIG_TCP_STEALTH + if (unlikely(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) && + __tcp_stealth_integrity_check(sk, skb)) { + tcp_reset(sk); + goto drop; + } +#endif + skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); @@ -5246,6 +5323,15 @@ int eaten = 0; bool fragstolen = false; +#ifdef CONFIG_TCP_STEALTH + if (unlikely(tp->stealth.mode & + TCP_STEALTH_MODE_INTEGRITY_LEN) && + __tcp_stealth_integrity_check(sk, skb)) { + tcp_reset(sk); + goto discard; + } +#endif + if (tp->ucopy.task == current && tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len && diff -Nur linux-4.2/net/ipv4/tcp_ipv4.c linux-4.2-pck/net/ipv4/tcp_ipv4.c --- linux-4.2/net/ipv4/tcp_ipv4.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ipv4/tcp_ipv4.c 2015-09-08 00:48:31.504590523 -0300 @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -235,6 +236,21 @@ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); +#ifdef CONFIG_TCP_STEALTH + /* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp as + * early as possible and thus move taking the snapshot of tcp_time_stamp + * here. 
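+	 * The same snapshot is copied onto the SYN skb in
+	 * tcp_transmit_skb(), so the TSVal sent on the wire matches the
+	 * one hashed into the ISN.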
+ */
+	skb_mstamp_get(&tp->stealth.mstamp);
+
+	if (!tp->write_seq && likely(!tp->repair) &&
+	    unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH))
+		tp->write_seq = tcp_stealth_sequence_number(sk,
+						&inet->inet_daddr,
+						sizeof(inet->inet_daddr),
+						usin->sin_port);
+#endif
+
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 							   inet->inet_daddr,
@@ -1382,6 +1398,8 @@
  */
 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcphdr *th = tcp_hdr(skb);
 	struct sock *rsk;
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
@@ -1403,6 +1421,15 @@
 	if (tcp_checksum_complete(skb))
 		goto csum_err;
 
+#ifdef CONFIG_TCP_STEALTH
+	if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin &&
+	    unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH) &&
+	    tcp_stealth_do_auth(sk, skb)) {
+		rsk = sk;
+		goto reset;
+	}
+#endif
+
 	if (sk->sk_state == TCP_LISTEN) {
 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
 		if (!nsk)
diff -Nur linux-4.2/net/ipv4/tcp_output.c linux-4.2-pck/net/ipv4/tcp_output.c
--- linux-4.2/net/ipv4/tcp_output.c	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/net/ipv4/tcp_output.c	2015-09-08 00:48:31.504590523 -0300
@@ -939,6 +939,13 @@
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
+#ifdef CONFIG_TCP_STEALTH
+	if (unlikely(tcb->tcp_flags & TCPHDR_SYN &&
+	    tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+		skb->skb_mstamp = tp->stealth.mstamp;
+	}
+#endif
+
 	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
 	else
@@ -3251,7 +3258,15 @@
 		return -ENOBUFS;
 
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+#ifdef CONFIG_TCP_STEALTH
+	/* The timestamp was already taken at the time the ISN was
+	 * generated, as we need to know its value in
+	 * tcp_stealth_sequence_number().
+	 */
+	tp->retrans_stamp = tp->stealth.mstamp.stamp_jiffies;
+#else
 	tp->retrans_stamp = tcp_time_stamp;
+#endif
 	tcp_connect_queue_skb(sk, buff);
 	tcp_ecn_send_syn(sk, buff);
diff -Nur linux-4.2/net/ipv6/tcp_ipv6.c linux-4.2-pck/net/ipv6/tcp_ipv6.c
--- linux-4.2/net/ipv6/tcp_ipv6.c	2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/net/ipv6/tcp_ipv6.c	2015-09-08 00:48:31.504590523 -0300
@@ -63,6 +63,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -278,6 +279,21 @@
 
 	ip6_set_txhash(sk);
 
+#ifdef CONFIG_TCP_STEALTH
+	/* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp
+	 * as early as possible and thus move taking the snapshot of
+	 * tcp_time_stamp here.
+	 */
+	skb_mstamp_get(&tp->stealth.mstamp);
+
+	if (!tp->write_seq && likely(!tp->repair) &&
+	    unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH))
+		tp->write_seq = tcp_stealth_sequence_number(sk,
+						sk->sk_v6_daddr.s6_addr32,
+						sizeof(sk->sk_v6_daddr),
+						inet->inet_dport);
+#endif
+
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
 							     sk->sk_v6_daddr.s6_addr32,
@@ -1191,7 +1207,8 @@
 static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct tcp_sock *tp;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcphdr *th = tcp_hdr(skb);
 	struct sk_buff *opt_skb = NULL;
 
 	/* Imagine: socket is IPv6.
IPv4 packet arrives, @@ -1251,6 +1268,13 @@ if (tcp_checksum_complete(skb)) goto csum_err; +#ifdef CONFIG_TCP_STEALTH + if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin && + tp->stealth.mode & TCP_STEALTH_MODE_AUTH && + tcp_stealth_do_auth(sk, skb)) + goto reset; +#endif + if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_hnd_req(sk, skb); if (!nsk) diff -Nur linux-4.2/net/netfilter/nf_conntrack_core.c linux-4.2-pck/net/netfilter/nf_conntrack_core.c --- linux-4.2/net/netfilter/nf_conntrack_core.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/netfilter/nf_conntrack_core.c 2015-09-08 00:48:22.241696581 -0300 @@ -320,12 +320,13 @@ } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); -static void nf_ct_tmpl_free(struct nf_conn *tmpl) +void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); nf_ct_ext_free(tmpl); kfree(tmpl); } +EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); static void destroy_conntrack(struct nf_conntrack *nfct) diff -Nur linux-4.2/net/netfilter/nf_synproxy_core.c linux-4.2-pck/net/netfilter/nf_synproxy_core.c --- linux-4.2/net/netfilter/nf_synproxy_core.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/netfilter/nf_synproxy_core.c 2015-09-08 00:48:22.241696581 -0300 @@ -378,7 +378,7 @@ err3: free_percpu(snet->stats); err2: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err1: return err; } diff -Nur linux-4.2/net/netfilter/xt_CT.c linux-4.2-pck/net/netfilter/xt_CT.c --- linux-4.2/net/netfilter/xt_CT.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/netfilter/xt_CT.c 2015-09-08 00:48:22.241696581 -0300 @@ -233,7 +233,7 @@ return 0; err3: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err2: nf_ct_l3proto_module_put(par->family); err1: diff -Nur linux-4.2/samples/Kconfig linux-4.2-pck/samples/Kconfig --- linux-4.2/samples/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/samples/Kconfig 2015-09-08 00:48:22.241696581 -0300 @@ -55,6 +55,13 @@ Build an example of how to dynamically add the hello command to the kdb shell. +config SAMPLE_KDBUS + bool "Build kdbus API example" + depends on KDBUS + help + Build an example of how the kdbus API can be used from + userspace. + config SAMPLE_RPMSG_CLIENT tristate "Build rpmsg client sample -- loadable modules only" depends on RPMSG && m diff -Nur linux-4.2/samples/Makefile linux-4.2-pck/samples/Makefile --- linux-4.2/samples/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/samples/Makefile 2015-09-08 00:48:22.241696581 -0300 @@ -1,4 +1,5 @@ # Makefile for Linux samples code obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ livepatch/ \ - hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ + hw_breakpoint/ kfifo/ kdb/ kdbus/ hidraw/ rpmsg/ \ + seccomp/ diff -Nur linux-4.2/samples/kdbus/.gitignore linux-4.2-pck/samples/kdbus/.gitignore --- linux-4.2/samples/kdbus/.gitignore 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/samples/kdbus/.gitignore 2015-09-08 00:48:22.241696581 -0300 @@ -0,0 +1 @@ +kdbus-workers diff -Nur linux-4.2/samples/kdbus/Makefile linux-4.2-pck/samples/kdbus/Makefile --- linux-4.2/samples/kdbus/Makefile 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/samples/kdbus/Makefile 2015-09-08 00:48:22.241696581 -0300 @@ -0,0 +1,9 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. 
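+# hostprogs are wired up through the "always" rule below; the dummy
+# object listed under obj- is never built, it just keeps kbuild happy.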
+obj- := dummy.o
+
+hostprogs-$(CONFIG_SAMPLE_KDBUS) += kdbus-workers
+
+always := $(hostprogs-y)
+
+HOSTCFLAGS_kdbus-workers.o += -I$(objtree)/usr/include
+HOSTLOADLIBES_kdbus-workers := -lrt
diff -Nur linux-4.2/samples/kdbus/kdbus-api.h linux-4.2-pck/samples/kdbus/kdbus-api.h
--- linux-4.2/samples/kdbus/kdbus-api.h 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/samples/kdbus/kdbus-api.h 2015-09-08 00:48:22.241696581 -0300
@@ -0,0 +1,114 @@
+#ifndef KDBUS_API_H
+#define KDBUS_API_H
+
+#include <sys/ioctl.h>
+#include <linux/kdbus.h>
+
+#define KDBUS_ALIGN8(l) (((l) + 7) & ~7)
+#define KDBUS_ITEM_HEADER_SIZE offsetof(struct kdbus_item, data)
+#define KDBUS_ITEM_SIZE(s) KDBUS_ALIGN8((s) + KDBUS_ITEM_HEADER_SIZE)
+#define KDBUS_ITEM_NEXT(item) \
+	(typeof(item))((uint8_t *)(item) + KDBUS_ALIGN8((item)->size))
+#define KDBUS_FOREACH(iter, first, _size) \
+	for ((iter) = (first); \
+	     ((uint8_t *)(iter) < (uint8_t *)(first) + (_size)) && \
+	     ((uint8_t *)(iter) >= (uint8_t *)(first)); \
+	     (iter) = (void *)((uint8_t *)(iter) + KDBUS_ALIGN8((iter)->size)))
+
+static inline int kdbus_cmd_bus_make(int control_fd, struct kdbus_cmd *cmd)
+{
+	int ret = ioctl(control_fd, KDBUS_CMD_BUS_MAKE, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_endpoint_make(int bus_fd, struct kdbus_cmd *cmd)
+{
+	int ret = ioctl(bus_fd, KDBUS_CMD_ENDPOINT_MAKE, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_endpoint_update(int ep_fd, struct kdbus_cmd *cmd)
+{
+	int ret = ioctl(ep_fd, KDBUS_CMD_ENDPOINT_UPDATE, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_hello(int bus_fd, struct kdbus_cmd_hello *cmd)
+{
+	int ret = ioctl(bus_fd, KDBUS_CMD_HELLO, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_update(int fd, struct kdbus_cmd *cmd)
+{
+	int ret = ioctl(fd, KDBUS_CMD_UPDATE, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_byebye(int conn_fd, struct kdbus_cmd *cmd)
+{
+	int ret = ioctl(conn_fd, KDBUS_CMD_BYEBYE, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_free(int conn_fd, struct kdbus_cmd_free *cmd)
+{
+	int ret = ioctl(conn_fd, KDBUS_CMD_FREE, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_conn_info(int conn_fd, struct kdbus_cmd_info *cmd)
+{
+	int ret = ioctl(conn_fd, KDBUS_CMD_CONN_INFO, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_bus_creator_info(int conn_fd, struct kdbus_cmd_info *cmd)
+{
+	int ret = ioctl(conn_fd, KDBUS_CMD_BUS_CREATOR_INFO, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_list(int fd, struct kdbus_cmd_list *cmd)
+{
+	int ret = ioctl(fd, KDBUS_CMD_LIST, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_send(int conn_fd, struct kdbus_cmd_send *cmd)
+{
+	int ret = ioctl(conn_fd, KDBUS_CMD_SEND, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_recv(int conn_fd, struct kdbus_cmd_recv *cmd)
+{
+	int ret = ioctl(conn_fd, KDBUS_CMD_RECV, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_name_acquire(int conn_fd, struct kdbus_cmd *cmd)
+{
+	int ret = ioctl(conn_fd, KDBUS_CMD_NAME_ACQUIRE, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_name_release(int conn_fd, struct kdbus_cmd *cmd)
+{
+	int ret = ioctl(conn_fd, KDBUS_CMD_NAME_RELEASE, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_match_add(int conn_fd, struct kdbus_cmd_match *cmd)
+{
+	int ret = ioctl(conn_fd, KDBUS_CMD_MATCH_ADD, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+static inline int kdbus_cmd_match_remove(int conn_fd, struct kdbus_cmd_match *cmd)
+{
+	int ret = ioctl(conn_fd, KDBUS_CMD_MATCH_REMOVE, cmd);
+	return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0;
+}
+
+#endif /* KDBUS_API_H */
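Every wrapper in this header follows the same pattern: forward a command struct to ioctl(2) and fold the result into a 0-or-negative-errno return. A minimal sketch of how a client might drive these wrappers follows; the bus path and pool size are illustrative assumptions, not something the header defines:

	/* Sketch only: connect to an existing bus and peek at one message. */
	#include <errno.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include "kdbus-api.h"

	static int connect_and_peek(const char *path)
	{
		/* e.g. "/sys/fs/kdbus/1000-example/bus" (assumed layout) */
		struct kdbus_cmd_hello hello = {
			.size = sizeof(hello),
			.pool_size = 1024 * 1024, /* > 0, multiple of PAGE_SIZE */
		};
		struct kdbus_cmd_recv recv = { .size = sizeof(recv) };
		int fd, r;

		fd = open(path, O_RDWR | O_CLOEXEC);
		if (fd < 0)
			return -errno;

		r = kdbus_cmd_hello(fd, &hello);	/* 0 or -errno */
		if (r == 0) {
			r = kdbus_cmd_recv(fd, &recv);
			if (r == -EAGAIN)	/* no message queued; not an error */
				r = 0;
		}

		close(fd);
		return r;
	}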
diff -Nur linux-4.2/samples/kdbus/kdbus-workers.c linux-4.2-pck/samples/kdbus/kdbus-workers.c
--- linux-4.2/samples/kdbus/kdbus-workers.c 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/samples/kdbus/kdbus-workers.c 2015-09-08 00:48:22.241696581 -0300
@@ -0,0 +1,1346 @@
+/*
+ * Copyright (C) 2013-2015 David Herrmann
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+/*
+ * Example: Workers
+ * This program computes prime-numbers based on the sieve of Eratosthenes. The
+ * master sets up a shared memory region and spawns workers which clear out the
+ * non-primes. The master reacts to keyboard input and to client-requests to
+ * control what each worker does. Note that this is in no way meant as an
+ * efficient way to compute primes. It should only serve as an example of how
+ * a master/worker concept can be implemented with kdbus used for the control
+ * messages.
+ *
+ * The main process is called the 'master'. It creates a new, private bus which
+ * will be used between the master and its workers to communicate. The master
+ * then spawns a fixed number of workers. Whenever a worker dies (detected via
+ * SIGCHLD), the master spawns a new worker. When done, the master waits for all
+ * workers to exit, prints a status report and exits itself.
+ *
+ * The master process does *not* keep track of its workers. Instead, this
+ * example implements a PULL model. That is, the master acquires a well-known
+ * name on the bus which each worker uses to request tasks from the master. If
+ * there are no more tasks, the master will return an empty task-list, which
+ * causes a worker to exit immediately.
+ *
+ * As tasks can be computationally expensive, we support cancellation. Whenever
+ * the master process is interrupted, it will drop its well-known name on the
+ * bus. This causes kdbus to broadcast a name-change notification. The workers
+ * check for broadcast messages regularly and will exit if they receive one.
+ *
+ * This example consists of 4 objects:
+ * * master: The master object contains the context of the master process. This
+ *           process manages the prime-context, spawns workers and assigns
+ *           prime-ranges to each worker to compute.
+ *           The master does not do any prime-computations itself.
+ * * child: The child object contains the context of a worker. It inherits the
+ *          prime context from its parent (the master) and then creates a new
+ *          bus context to request prime-ranges to compute.
+ * * prime: The "prime" object is used to abstract how we compute primes. When
+ *          allocated, it prepares a memory region to hold 1 bit for each
+ *          natural number up to a fixed maximum ('MAX_PRIMES').
+ *          The memory region is backed by a memfd which we share between
+ *          processes. Each worker then gets assigned a range of natural
+ *          numbers whose multiples it clears off the memory region. The
+ *          master process is responsible for distributing all natural numbers
+ *          up to the fixed maximum to its workers.
+ * * bus: The bus object is an abstraction of the kdbus API. It is pretty
+ *        straightforward and only manages the connection-fd plus the
+ *        memory-mapped pool in a single object.
+ *
+ * This example is in reverse order, which should make it easier to read
+ * top-down, but requires some forward-declarations. Just ignore those.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+
+/* glibc < 2.7 does not ship sys/signalfd.h */
+/* we require kernels with __NR_memfd_create */
+#if __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 7 && defined(__NR_memfd_create)
+
+#include <alloca.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/memfd.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/signalfd.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+#include "kdbus-api.h"
+
+/* FORWARD DECLARATIONS */
+
+#define POOL_SIZE (16 * 1024 * 1024)
+#define MAX_PRIMES (2UL << 24)
+#define WORKER_COUNT (16)
+#define PRIME_STEPS (65536 * 4)
+
+static const char *arg_busname = "example-workers";
+static const char *arg_modname = "kdbus";
+static const char *arg_master = "org.freedesktop.master";
+
+static int err_assert(int r_errno, const char *msg, const char *func, int line,
+		      const char *file)
+{
+	r_errno = (r_errno != 0) ? -abs(r_errno) : -EFAULT;
+	if (r_errno < 0) {
+		errno = -r_errno;
+		fprintf(stderr, "ERR: %s: %m (%s:%d in %s)\n",
+			msg, func, line, file);
+	}
+	return r_errno;
+}
+
+#define err_r(_r, _msg) err_assert((_r), (_msg), __func__, __LINE__, __FILE__)
+#define err(_msg) err_r(errno, (_msg))
+
+struct prime;
+struct bus;
+struct master;
+struct child;
+
+struct prime {
+	int fd;
+	uint8_t *area;
+	size_t max;
+	size_t done;
+	size_t status;
+};
+
+static int prime_new(struct prime **out);
+static void prime_free(struct prime *p);
+static bool prime_done(struct prime *p);
+static void prime_consume(struct prime *p, size_t amount);
+static int prime_run(struct prime *p, struct bus *cancel, size_t number);
+static void prime_print(struct prime *p);
+
+struct bus {
+	int fd;
+	uint8_t *pool;
+};
+
+static int bus_open_connection(struct bus **out, uid_t uid, const char *name,
+			       uint64_t recv_flags);
+static void bus_close_connection(struct bus *b);
+static void bus_pool_free_slice(struct bus *b, uint64_t offset);
+static int bus_acquire_name(struct bus *b, const char *name);
+static int bus_install_name_loss_match(struct bus *b, const char *name);
+static int bus_poll(struct bus *b);
+static int bus_make(uid_t uid, const char *name);
+
+struct master {
+	size_t n_workers;
+	size_t max_workers;
+
+	int signal_fd;
+	int control_fd;
+
+	struct prime *prime;
+	struct bus *bus;
+};
+
+static int master_new(struct master **out);
+static void master_free(struct master *m);
+static int master_run(struct master *m);
+static int master_poll(struct master *m);
+static int master_handle_stdin(struct master *m);
+static int master_handle_signal(struct master *m);
+static int master_handle_bus(struct master *m);
+static int master_reply(struct master *m, const struct kdbus_msg *msg);
+static int master_waitpid(struct master *m);
+static int master_spawn(struct master *m);
+
+struct child {
+	struct bus *bus;
+	struct prime *prime;
+};
+
+static int child_new(struct child **out, struct prime *p);
+static void child_free(struct child *c);
+static int child_run(struct child *c);
+
+/* END OF FORWARD DECLARATIONS */
+
+/*
+ * This is the main entrypoint of this example. It is pretty straightforward.
+ * We create a master object, run the computation, print a status report and
+ * then exit. Nothing particularly interesting here, so let's look at the
+ * master object...
+ */
+int main(int argc, char **argv)
+{
+	struct master *m = NULL;
+	int r;
+
+	r = master_new(&m);
+	if (r < 0)
+		goto out;
+
+	r = master_run(m);
+	if (r < 0)
+		goto out;
+
+	if (0)
+		prime_print(m->prime);
+
+out:
+	master_free(m);
+	if (r < 0 && r != -EINTR)
+		fprintf(stderr, "failed\n");
+	else
+		fprintf(stderr, "done\n");
+	return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+/*
+ * ...this will allocate a new master context. It keeps track of the current
+ * number of children/workers that are running, manages a signalfd to track
+ * SIGCHLD, and creates a private kdbus bus. Afterwards, it opens its connection
+ * to the bus and acquires a well-known name (arg_master).
+ */
+static int master_new(struct master **out)
+{
+	struct master *m;
+	sigset_t smask;
+	int r;
+
+	m = calloc(1, sizeof(*m));
+	if (!m)
+		return err("cannot allocate master");
+
+	m->max_workers = WORKER_COUNT;
+	m->signal_fd = -1;
+	m->control_fd = -1;
+
+	/* Block SIGINT and SIGCHLD signals */
+	sigemptyset(&smask);
+	sigaddset(&smask, SIGINT);
+	sigaddset(&smask, SIGCHLD);
+	sigprocmask(SIG_BLOCK, &smask, NULL);
+
+	m->signal_fd = signalfd(-1, &smask, SFD_CLOEXEC);
+	if (m->signal_fd < 0) {
+		r = err("cannot create signalfd");
+		goto error;
+	}
+
+	r = prime_new(&m->prime);
+	if (r < 0)
+		goto error;
+
+	m->control_fd = bus_make(getuid(), arg_busname);
+	if (m->control_fd < 0) {
+		r = m->control_fd;
+		goto error;
+	}
+
+	/*
+	 * Open a bus connection for the master, and require each received
+	 * message to have a metadata item of type KDBUS_ITEM_PIDS attached.
+	 * The current UID is needed to compute the name of the bus node to
+	 * connect to.
+	 */
+	r = bus_open_connection(&m->bus, getuid(),
+				arg_busname, KDBUS_ATTACH_PIDS);
+	if (r < 0)
+		goto error;
+
+	/*
+	 * Acquire a well-known name on the bus, so children can address
+	 * messages to the master using KDBUS_DST_ID_NAME as destination-ID
+	 * of messages.
+	 */
+	r = bus_acquire_name(m->bus, arg_master);
+	if (r < 0)
+		goto error;
+
+	*out = m;
+	return 0;
+
+error:
+	master_free(m);
+	return r;
+}
+
+/* pretty straightforward destructor of a master object */
+static void master_free(struct master *m)
+{
+	if (!m)
+		return;
+
+	bus_close_connection(m->bus);
+	if (m->control_fd >= 0)
+		close(m->control_fd);
+	prime_free(m->prime);
+	if (m->signal_fd >= 0)
+		close(m->signal_fd);
+	free(m);
+}
+
+static int master_run(struct master *m)
+{
+	int res, r = 0;
+
+	while (!prime_done(m->prime)) {
+		while (m->n_workers < m->max_workers) {
+			r = master_spawn(m);
+			if (r < 0)
+				break;
+		}
+
+		r = master_poll(m);
+		if (r < 0)
+			break;
+	}
+
+	if (r < 0) {
+		bus_close_connection(m->bus);
+		m->bus = NULL;
+	}
+
+	while (m->n_workers > 0) {
+		res = master_poll(m);
+		if (res < 0) {
+			if (m->bus) {
+				bus_close_connection(m->bus);
+				m->bus = NULL;
+			}
+			r = res;
+		}
+	}
+
+	return r == -EINTR ? 0 : r;
+}
+
+static int master_poll(struct master *m)
+{
+	struct pollfd fds[3] = {};
+	int r = 0, n = 0;
+
+	/*
+	 * Add stdin, the signalfd and the connection owner file descriptor to
+	 * the pollfd table, and handle incoming traffic on the latter in
+	 * master_handle_bus().
+	 */
+	fds[n].fd = STDIN_FILENO;
+	fds[n++].events = POLLIN;
+	fds[n].fd = m->signal_fd;
+	fds[n++].events = POLLIN;
+	if (m->bus) {
+		fds[n].fd = m->bus->fd;
+		fds[n++].events = POLLIN;
+	}
+
+	r = poll(fds, n, -1);
+	if (r < 0)
+		return err("poll() failed");
+
+	if (fds[0].revents & POLLIN)
+		r = master_handle_stdin(m);
+	else if (fds[0].revents)
+		r = err("ERR/HUP on stdin");
+	if (r < 0)
+		return r;
+
+	if (fds[1].revents & POLLIN)
+		r = master_handle_signal(m);
+	else if (fds[1].revents)
+		r = err("ERR/HUP on signalfd");
+	if (r < 0)
+		return r;
+
+	if (fds[2].revents & POLLIN)
+		r = master_handle_bus(m);
+	else if (fds[2].revents)
+		r = err("ERR/HUP on bus");
+
+	return r;
+}
+
+static int master_handle_stdin(struct master *m)
+{
+	char buf[128];
+	ssize_t l;
+	int r = 0;
+
+	l = read(STDIN_FILENO, buf, sizeof(buf));
+	if (l < 0)
+		return err("cannot read stdin");
+	if (l == 0)
+		return err_r(-EINVAL, "EOF on stdin");
+
+	while (l-- > 0) {
+		switch (buf[l]) {
+		case 'q':
+			/* quit */
+			r = -EINTR;
+			break;
+		case '\n':
+		case ' ':
+			/* ignore */
+			break;
+		default:
+			if (isgraph(buf[l]))
+				fprintf(stderr, "invalid input '%c'\n", buf[l]);
+			else
+				fprintf(stderr, "invalid input 0x%x\n", buf[l]);
+			break;
+		}
+	}
+
+	return r;
+}
+
+static int master_handle_signal(struct master *m)
+{
+	struct signalfd_siginfo val;
+	ssize_t l;
+
+	l = read(m->signal_fd, &val, sizeof(val));
+	if (l < 0)
+		return err("cannot read signalfd");
+	if (l != sizeof(val))
+		return err_r(-EINVAL, "invalid data from signalfd");
+
+	switch (val.ssi_signo) {
+	case SIGCHLD:
+		return master_waitpid(m);
+	case SIGINT:
+		return err_r(-EINTR, "interrupted");
+	default:
+		return err_r(-EINVAL, "caught invalid signal");
+	}
+}
+
+static int master_handle_bus(struct master *m)
+{
+	struct kdbus_cmd_recv recv = { .size = sizeof(recv) };
+	const struct kdbus_msg *msg = NULL;
+	const struct kdbus_item *item;
+	const struct kdbus_vec *vec = NULL;
+	int r = 0;
+
+	/*
+	 * To receive a message, the KDBUS_CMD_RECV ioctl is used.
+	 * It takes an argument of type 'struct kdbus_cmd_recv', which
+	 * will contain information on the received message when the call
+	 * returns. See kdbus.message(7).
+	 */
+	r = kdbus_cmd_recv(m->bus->fd, &recv);
+	/*
+	 * EAGAIN is returned when there is no message waiting on this
+	 * connection. This is not an error - simply bail out.
+	 */
+	if (r == -EAGAIN)
+		return 0;
+	if (r < 0)
+		return err_r(r, "cannot receive message");
+
+	/*
+	 * Messages received by a connection are stored inside the connection's
+	 * pool, at an offset that has been returned in the 'recv' command
+	 * struct above. The value describes the relative offset from the
+	 * start address of the pool. A message is described with
+	 * 'struct kdbus_msg'. See kdbus.message(7).
+	 */
+	msg = (void *)(m->bus->pool + recv.msg.offset);
+
+	/*
+	 * A message describes its actual payload in an array of items.
+	 * KDBUS_FOREACH() is a simple iterator that walks such an array.
+	 * struct kdbus_msg has a field to denote its total size, which is
+	 * needed to determine the number of items in the array.
+	 */
+	KDBUS_FOREACH(item, msg->items,
+		      msg->size - offsetof(struct kdbus_msg, items)) {
+		/*
+		 * An item of type PAYLOAD_OFF describes in-line memory
+		 * stored in the pool at a described offset. That offset is
+		 * relative to the start address of the message header.
+		 * This example program only expects one single item of that
+		 * type, remembers the struct kdbus_vec member of the item
+		 * when it sees it, and bails out if there is more than one
+		 * of them.
+		 */
+		if (item->type == KDBUS_ITEM_PAYLOAD_OFF) {
+			if (vec) {
+				r = err_r(-EEXIST,
+					  "message with multiple vecs");
+				break;
+			}
+			vec = &item->vec;
+			if (vec->size != 1) {
+				r = err_r(-EINVAL, "invalid message size");
+				break;
+			}
+
+		/*
+		 * MEMFDs are transported as items of type PAYLOAD_MEMFD.
+		 * If such an item is attached, a new file descriptor was
+		 * installed into the task when KDBUS_CMD_RECV was called, and
+		 * its number is stored in item->memfd.fd.
+		 * Implementers *must* handle this item type and close the
+		 * file descriptor when no longer needed in order to prevent
+		 * file descriptor exhaustion. This example program just bails
+		 * out with an error in this case, as memfds are not expected
+		 * in this context.
+		 */
+		} else if (item->type == KDBUS_ITEM_PAYLOAD_MEMFD) {
+			r = err_r(-EINVAL, "message with memfd");
+			break;
+		}
+	}
+	if (r < 0)
+		goto exit;
+	if (!vec) {
+		r = err_r(-EINVAL, "empty message");
+		goto exit;
+	}
+
+	switch (*((const uint8_t *)msg + vec->offset)) {
+	case 'r': {
+		r = master_reply(m, msg);
+		break;
+	}
+	default:
+		r = err_r(-EINVAL, "invalid message type");
+		break;
+	}
+
+exit:
+	/*
+	 * We are done with the memory slice that was given to us through
+	 * recv.msg.offset. Tell the kernel it can use it for other content
+	 * in the future. See kdbus.pool(7).
+	 */
+	bus_pool_free_slice(m->bus, recv.msg.offset);
+	return r;
+}
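/*
 * Illustrative sketch, not part of the sample above: a receiver that does
 * accept PAYLOAD_MEMFD items must close every descriptor the kernel
 * installed, or it will leak fds. The KDBUS_ITEM_FDS branch assumes the
 * uapi's flexible 'int fds[]' member of struct kdbus_item.
 */
static void drain_received_fds(const struct kdbus_msg *msg)
{
	const struct kdbus_item *item;

	KDBUS_FOREACH(item, msg->items,
		      msg->size - offsetof(struct kdbus_msg, items)) {
		if (item->type == KDBUS_ITEM_PAYLOAD_MEMFD) {
			close(item->memfd.fd);
		} else if (item->type == KDBUS_ITEM_FDS) {
			size_t i, n;

			n = (item->size - KDBUS_ITEM_HEADER_SIZE) / sizeof(int);
			for (i = 0; i < n; i++)
				close(item->fds[i]);
		}
	}
}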
+
+static int master_reply(struct master *m, const struct kdbus_msg *msg)
+{
+	struct kdbus_cmd_send cmd;
+	struct kdbus_item *item;
+	struct kdbus_msg *reply;
+	size_t size, status, p[2];
+	int r;
+
+	/*
+	 * This function sends a message over kdbus. To do this, it uses the
+	 * KDBUS_CMD_SEND ioctl, which takes a command struct argument of type
+	 * 'struct kdbus_cmd_send'. This struct stores a pointer to the actual
+	 * message to send. See kdbus.message(7).
+	 */
+	p[0] = m->prime->done;
+	p[1] = prime_done(m->prime) ? 0 : PRIME_STEPS;
+
+	size = sizeof(*reply);
+	size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_vec));
+
+	/* Prepare the message to send */
+	reply = alloca(size);
+	memset(reply, 0, size);
+	reply->size = size;
+
+	/* Each message has a cookie that can be used to send replies */
+	reply->cookie = 1;
+
+	/* The payload_type is arbitrary, but it must be non-zero */
+	reply->payload_type = 0xdeadbeef;
+
+	/*
+	 * We are sending a reply. Let the kernel know the cookie of the
+	 * message we are replying to.
+	 */
+	reply->cookie_reply = msg->cookie;
+
+	/*
+	 * Messages can either be directed to a well-known name (stored as
+	 * string) or to a unique name (stored as number). This example does
+	 * the latter. If the message would be directed to a well-known name
+	 * instead, the message's dst_id field would be set to
+	 * KDBUS_DST_ID_NAME, and the name would be attached in an item of type
+	 * KDBUS_ITEM_DST_NAME. See below for an example, and also refer to
+	 * kdbus.message(7).
+	 */
+	reply->dst_id = msg->src_id;
+
+	/* Our message has exactly one item to store its payload */
+	item = reply->items;
+	item->type = KDBUS_ITEM_PAYLOAD_VEC;
+	item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec);
+	item->vec.address = (uintptr_t)p;
+	item->vec.size = sizeof(p);
+
+	/*
+	 * Now prepare the command struct, and reference the message we want
+	 * to send.
+	 */
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.size = sizeof(cmd);
+	cmd.msg_address = (uintptr_t)reply;
+
+	/*
+	 * Finally, employ the command on the connection owner
+	 * file descriptor.
+	 */
+	r = kdbus_cmd_send(m->bus->fd, &cmd);
+	if (r < 0)
+		return err_r(r, "cannot send reply");
+
+	if (p[1]) {
+		prime_consume(m->prime, p[1]);
+		status = m->prime->done * 10000 / m->prime->max;
+		if (status != m->prime->status) {
+			m->prime->status = status;
+			fprintf(stderr, "status: %7.3lf%%\n",
+				(double)status / 100);
+		}
+	}
+
+	return 0;
+}
+
+static int master_waitpid(struct master *m)
+{
+	pid_t pid;
+	int r;
+
+	while ((pid = waitpid(-1, &r, WNOHANG)) > 0) {
+		if (m->n_workers > 0)
+			--m->n_workers;
+		if (!WIFEXITED(r))
+			r = err_r(-EINVAL, "child died unexpectedly");
+		else if (WEXITSTATUS(r) != 0)
+			r = err_r(-WEXITSTATUS(r), "child failed");
+	}
+
+	return r;
+}
+
+static int master_spawn(struct master *m)
+{
+	struct child *c = NULL;
+	struct prime *p = NULL;
+	pid_t pid;
+	int r;
+
+	/* Spawn off one child and call child_run() inside it */
+
+	pid = fork();
+	if (pid < 0)
+		return err("cannot fork");
+	if (pid > 0) {
+		/* parent */
+		++m->n_workers;
+		return 0;
+	}
+
+	/* child */
+
+	p = m->prime;
+	m->prime = NULL;
+	master_free(m);
+
+	r = child_new(&c, p);
+	if (r < 0)
+		goto exit;
+
+	r = child_run(c);
+
+exit:
+	child_free(c);
+	exit(abs(r));
+}
+
+static int child_new(struct child **out, struct prime *p)
+{
+	struct child *c;
+	int r;
+
+	c = calloc(1, sizeof(*c));
+	if (!c)
+		return err("cannot allocate child");
+
+	c->prime = p;
+
+	/*
+	 * Open a connection to the bus and require each received message to
+	 * carry a list of the well-known names the sending connection currently
+	 * owns. The current UID is needed in order to determine the name of the
+	 * bus node to connect to.
+	 */
+	r = bus_open_connection(&c->bus, getuid(),
+				arg_busname, KDBUS_ATTACH_NAMES);
+	if (r < 0)
+		goto error;
+
+	/*
+	 * Install a kdbus match so the child's connection gets notified when
+	 * the master loses its well-known name.
+	 */
+	r = bus_install_name_loss_match(c->bus, arg_master);
+	if (r < 0)
+		goto error;
+
+	*out = c;
+	return 0;
+
+error:
+	child_free(c);
+	return r;
+}
+
+static void child_free(struct child *c)
+{
+	if (!c)
+		return;
+
+	bus_close_connection(c->bus);
+	prime_free(c->prime);
+	free(c);
+}
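/*
 * A worked example of the size arithmetic used on both sides of this
 * protocol, assuming the uapi layout where struct kdbus_item starts with
 * two __u64 fields (so KDBUS_ITEM_HEADER_SIZE == 16) and struct kdbus_vec
 * is 16 bytes:
 *
 *   KDBUS_ITEM_SIZE(sizeof(struct kdbus_vec))
 *       = KDBUS_ALIGN8(16 + 16) = 32
 *   KDBUS_ITEM_SIZE(strlen("org.freedesktop.master") + 1)
 *       = KDBUS_ALIGN8(16 + 23) = 40
 *
 * The request assembled in child_run() below therefore occupies
 * sizeof(struct kdbus_msg) + 40 + 32 bytes, and every item starts on the
 * 8-byte boundary the kernel requires.
 */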
+
+static int child_run(struct child *c)
+{
+	struct kdbus_cmd_send cmd;
+	struct kdbus_item *item;
+	struct kdbus_vec *vec = NULL;
+	struct kdbus_msg *msg;
+	struct timespec spec;
+	size_t n, steps, size;
+	int r = 0;
+
+	/*
+	 * Let's send a message to the master and ask for work. To do this,
+	 * we use the KDBUS_CMD_SEND ioctl, which takes an argument of type
+	 * 'struct kdbus_cmd_send'. This struct stores a pointer to the actual
+	 * message to send. See kdbus.message(7).
+	 */
+	size = sizeof(*msg);
+	size += KDBUS_ITEM_SIZE(strlen(arg_master) + 1);
+	size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_vec));
+
+	msg = alloca(size);
+	memset(msg, 0, size);
+	msg->size = size;
+
+	/*
+	 * Tell the kernel that we expect a reply to this message. This means
+	 * that
+	 *
+	 * a) The remote peer will gain temporary permission to talk to us
+	 *    even if it would not be allowed to normally.
+	 *
+	 * b) A timeout value is required.
+	 *
+	 * For asynchronous send commands, if no reply is received, we will
+	 * get a kernel notification with an item of type
+	 * KDBUS_ITEM_REPLY_TIMEOUT attached.
+	 *
+	 * For synchronous send commands (which this example does), the
+	 * ioctl will block until a reply is received or the timeout is
+	 * exceeded.
+	 */
+	msg->flags = KDBUS_MSG_EXPECT_REPLY;
+
+	/* Set our cookie. Replies must use this cookie to send their reply. */
+	msg->cookie = 1;
+
+	/* The payload_type is arbitrary, but it must be non-zero */
+	msg->payload_type = 0xdeadbeef;
+
+	/*
+	 * We are sending our message to the current owner of a well-known
+	 * name. This makes an item of type KDBUS_ITEM_DST_NAME mandatory.
+	 */
+	msg->dst_id = KDBUS_DST_ID_NAME;
+
+	/*
+	 * Set the reply timeout to 5 seconds. Timeouts are always set in
+	 * absolute timestamps, based on CLOCK_MONOTONIC. See kdbus.message(7).
+	 */
+	clock_gettime(CLOCK_MONOTONIC_COARSE, &spec);
+	msg->timeout_ns += (5 + spec.tv_sec) * 1000ULL * 1000ULL * 1000ULL;
+	msg->timeout_ns += spec.tv_nsec;
+
+	/*
+	 * Fill the appended items. First, set the well-known name of the
+	 * destination we want to talk to.
+	 */
+	item = msg->items;
+	item->type = KDBUS_ITEM_DST_NAME;
+	item->size = KDBUS_ITEM_HEADER_SIZE + strlen(arg_master) + 1;
+	strcpy(item->str, arg_master);
+
+	/*
+	 * The 2nd item contains a vector to memory we want to send. It
+	 * can be content of any type. In our case, we're sending a one-byte
+	 * string only. The memory referenced by this item will be copied into
+	 * the pool of the receiver connection, and does not need to be valid
+	 * after the command is employed.
+	 */
+	item = KDBUS_ITEM_NEXT(item);
+	item->type = KDBUS_ITEM_PAYLOAD_VEC;
+	item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec);
+	item->vec.address = (uintptr_t)"r";
+	item->vec.size = 1;
+
+	/* Set up the command struct and reference the message we prepared */
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.size = sizeof(cmd);
+	cmd.msg_address = (uintptr_t)msg;
+
+	/*
+	 * The send command knows a mode in which it will block until a
+	 * reply to a message is received. This example uses that mode.
+	 * The pool offset to the received reply will be stored in the command
+	 * struct after the send command returns. See below.
+	 */
+	cmd.flags = KDBUS_SEND_SYNC_REPLY;
+
+	/*
+	 * Finally, employ the command on the connection owner
+	 * file descriptor.
+	 */
+	r = kdbus_cmd_send(c->bus->fd, &cmd);
+	if (r == -ESRCH || r == -EPIPE || r == -ECONNRESET)
+		return 0;
+	if (r < 0)
+		return err_r(r, "cannot send request to master");
+
+	/*
+	 * The command was sent with the KDBUS_SEND_SYNC_REPLY flag set,
+	 * and returned successfully, which means that cmd.reply.offset now
+	 * points to a message inside our connection's pool where the reply
+	 * is found. This is equivalent to receiving the reply with
+	 * KDBUS_CMD_RECV, but it doesn't require waiting for the reply with
+	 * poll() and also saves the ioctl to receive the message.
+	 */
+	msg = (void *)(c->bus->pool + cmd.reply.offset);
+
+	/*
+	 * A message describes its actual payload in an array of items.
+	 * KDBUS_FOREACH() is a simple iterator that walks such an array.
+	 * struct kdbus_msg has a field to denote its total size, which is
+	 * needed to determine the number of items in the array.
+	 */
+	KDBUS_FOREACH(item, msg->items,
+		      msg->size - offsetof(struct kdbus_msg, items)) {
+		/*
+		 * An item of type PAYLOAD_OFF describes in-line memory
+		 * stored in the pool at a described offset. That offset is
+		 * relative to the start address of the message header.
+		 * This example program only expects one single item of that
+		 * type, remembers the struct kdbus_vec member of the item
+		 * when it sees it, and bails out if there is more than one
+		 * of them.
+		 */
+		if (item->type == KDBUS_ITEM_PAYLOAD_OFF) {
+			if (vec) {
+				r = err_r(-EEXIST,
+					  "message with multiple vecs");
+				break;
+			}
+			vec = &item->vec;
+			if (vec->size != 2 * sizeof(size_t)) {
+				r = err_r(-EINVAL, "invalid message size");
+				break;
+			}
+		/*
+		 * MEMFDs are transported as items of type PAYLOAD_MEMFD.
+		 * If such an item is attached, a new file descriptor was
+		 * installed into the task when KDBUS_CMD_RECV was called, and
+		 * its number is stored in item->memfd.fd.
+		 * Implementers *must* handle this item type and close the
+		 * file descriptor when no longer needed in order to prevent
+		 * file descriptor exhaustion. This example program just bails
+		 * out with an error in this case, as memfds are not expected
+		 * in this context.
+		 */
+		} else if (item->type == KDBUS_ITEM_PAYLOAD_MEMFD) {
+			r = err_r(-EINVAL, "message with memfd");
+			break;
+		}
+	}
+	if (r < 0)
+		goto exit;
+	if (!vec) {
+		r = err_r(-EINVAL, "empty message");
+		goto exit;
+	}
+
+	n = ((size_t *)((const uint8_t *)msg + vec->offset))[0];
+	steps = ((size_t *)((const uint8_t *)msg + vec->offset))[1];
+
+	while (steps-- > 0) {
+		++n;
+		r = prime_run(c->prime, c->bus, n);
+		if (r < 0)
+			break;
+		r = bus_poll(c->bus);
+		if (r != 0) {
+			r = r < 0 ? r : -EINTR;
+			break;
+		}
+	}
+
+exit:
+	/*
+	 * We are done with the memory slice that was given to us through
+	 * cmd.reply.offset. Tell the kernel it can use it for other content
+	 * in the future. See kdbus.pool(7).
+	 */
+	bus_pool_free_slice(c->bus, cmd.reply.offset);
+	return r;
+}
+
+/*
+ * Prime Computation
+ *
+ */
+
+static int prime_new(struct prime **out)
+{
+	struct prime *p;
+	int r;
+
+	p = calloc(1, sizeof(*p));
+	if (!p)
+		return err("cannot allocate prime memory");
+
+	p->fd = -1;
+	p->area = MAP_FAILED;
+	p->max = MAX_PRIMES;
+
+	/*
+	 * Prepare and map a memfd to store the bit-fields for the number
+	 * ranges we want to perform the prime detection on.
+	 */
+	p->fd = syscall(__NR_memfd_create, "prime-area", MFD_CLOEXEC);
+	if (p->fd < 0) {
+		r = err("cannot create memfd");
+		goto error;
+	}
+
+	r = ftruncate(p->fd, p->max / 8 + 1);
+	if (r < 0) {
+		r = err("cannot ftruncate area");
+		goto error;
+	}
+
+	p->area = mmap(NULL, p->max / 8 + 1, PROT_READ | PROT_WRITE,
+		       MAP_SHARED, p->fd, 0);
+	if (p->area == MAP_FAILED) {
+		r = err("cannot mmap memfd");
+		goto error;
+	}
+
+	*out = p;
+	return 0;
+
+error:
+	prime_free(p);
+	return r;
+}
+
+static void prime_free(struct prime *p)
+{
+	if (!p)
+		return;
+
+	if (p->area != MAP_FAILED)
+		munmap(p->area, p->max / 8 + 1);
+	if (p->fd >= 0)
+		close(p->fd);
+	free(p);
+}
+
+static bool prime_done(struct prime *p)
+{
+	return p->done >= p->max;
+}
+
+static void prime_consume(struct prime *p, size_t amount)
+{
+	p->done += amount;
+}
+
+static int prime_run(struct prime *p, struct bus *cancel, size_t number)
+{
+	size_t i, n = 0;
+	int r;
+
+	if (number < 2 || number > 65535)
+		return 0;
+
+	for (i = number * number;
+	     i < p->max && i > number;
+	     i += number) {
+		p->area[i / 8] |= 1 << (i % 8);
+
+		if (!(++n % (1 << 20))) {
+			r = bus_poll(cancel);
+			if (r != 0)
+				return r < 0 ? r : -EINTR;
+		}
+	}
+
+	return 0;
+}
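/*
 * Sizing note, following directly from the constants above: MAX_PRIMES is
 * 2UL << 24 = 33554432 numbers, so prime_new() allocates
 * 33554432 / 8 + 1 = 4194305 bytes, roughly 4 MiB of memfd-backed bitmap
 * shared by the master and all workers. The 'number > 65535' guard in
 * prime_run() costs nothing: any factor above sqrt(MAX_PRIMES), which is
 * about 5793, would start marking at number * number, already past the
 * end of the bitmap, and the guard also keeps that multiplication far
 * away from overflow on 32-bit size_t.
 */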
+
+static void prime_print(struct prime *p)
+{
+	size_t i, l = 0;
+
+	fprintf(stderr, "PRIMES:");
+	for (i = 0; i < p->max; ++i) {
+		if (!(p->area[i / 8] & (1 << (i % 8))))
+			fprintf(stderr, "%c%7zu", !(l++ % 16) ? '\n' : ' ', i);
+	}
+	fprintf(stderr, "\nEND\n");
+}
+
+static int bus_open_connection(struct bus **out, uid_t uid, const char *name,
+			       uint64_t recv_flags)
+{
+	struct kdbus_cmd_hello hello;
+	char path[128];
+	struct bus *b;
+	int r;
+
+	/*
+	 * The 'bus' object is our representation of a kdbus connection which
+	 * stores two details: the connection owner file descriptor, and the
+	 * mmap()ed memory of its associated pool. See kdbus.connection(7) and
+	 * kdbus.pool(7).
+	 */
+	b = calloc(1, sizeof(*b));
+	if (!b)
+		return err("cannot allocate bus memory");
+
+	b->fd = -1;
+	b->pool = MAP_FAILED;
+
+	/* Compute the name of the bus node to connect to. */
+	snprintf(path, sizeof(path), "/sys/fs/%s/%lu-%s/bus",
+		 arg_modname, (unsigned long)uid, name);
+	b->fd = open(path, O_RDWR | O_CLOEXEC);
+	if (b->fd < 0) {
+		r = err("cannot open bus");
+		goto error;
+	}
+
+	/*
+	 * To make a connection to the bus, the KDBUS_CMD_HELLO ioctl is used.
+	 * It takes an argument of type 'struct kdbus_cmd_hello'.
+	 */
+	memset(&hello, 0, sizeof(hello));
+	hello.size = sizeof(hello);
+
+	/*
+	 * Specify a mask of metadata attach flags, describing metadata items
+	 * that this new connection allows to be sent.
+	 */
+	hello.attach_flags_send = _KDBUS_ATTACH_ALL;
+
+	/*
+	 * Specify a mask of metadata attach flags, describing metadata items
+	 * that this new connection wants to receive along with each message.
+	 */
+	hello.attach_flags_recv = recv_flags;
+
+	/*
+	 * A connection may choose the size of its pool, but the number has to
+	 * comply with two rules: a) it must be greater than 0, and b) it must
+	 * be a multiple of PAGE_SIZE. See kdbus.pool(7).
+	 */
+	hello.pool_size = POOL_SIZE;
+
+	/*
+	 * Now employ the command on the file descriptor opened above.
+	 * This command will turn the file descriptor into a connection-owner
+	 * file descriptor that controls the life-time of the connection; once
+	 * it's closed, the connection is shut down.
+	 */
+	r = kdbus_cmd_hello(b->fd, &hello);
+	if (r < 0) {
+		err_r(r, "HELLO failed");
+		goto error;
+	}
+
+	bus_pool_free_slice(b, hello.offset);
+
+	/*
+	 * Map the pool of the connection. Its size has been set in the
+	 * command struct above. See kdbus.pool(7).
+	 */
+	b->pool = mmap(NULL, POOL_SIZE, PROT_READ, MAP_SHARED, b->fd, 0);
+	if (b->pool == MAP_FAILED) {
+		r = err("cannot mmap pool");
+		goto error;
+	}
+
+	*out = b;
+	return 0;
+
+error:
+	bus_close_connection(b);
+	return r;
+}
+
+static void bus_close_connection(struct bus *b)
+{
+	if (!b)
+		return;
+
+	/*
+	 * A bus connection is closed by simply calling close() on the
+	 * connection owner file descriptor. The unique name and all owned
+	 * well-known names of the connection will disappear.
+	 * See kdbus.connection(7).
+	 */
+	if (b->pool != MAP_FAILED)
+		munmap(b->pool, POOL_SIZE);
+	if (b->fd >= 0)
+		close(b->fd);
+	free(b);
+}
+
+static void bus_pool_free_slice(struct bus *b, uint64_t offset)
+{
+	struct kdbus_cmd_free cmd = {
+		.size = sizeof(cmd),
+		.offset = offset,
+	};
+	int r;
+
+	/*
+	 * Once we're done with a piece of pool memory that was returned
+	 * by a command, we have to call the KDBUS_CMD_FREE ioctl on it so it
+	 * can be reused. The command takes an argument of type
+	 * 'struct kdbus_cmd_free', in which the pool offset of the slice to
+	 * free is stored. The ioctl is employed on the connection owner
+	 * file descriptor. See kdbus.pool(7).
+	 */
+	r = kdbus_cmd_free(b->fd, &cmd);
+	if (r < 0)
+		err_r(r, "cannot free pool slice");
+}
+
+static int bus_acquire_name(struct bus *b, const char *name)
+{
+	struct kdbus_item *item;
+	struct kdbus_cmd *cmd;
+	size_t size;
+	int r;
+
+	/*
+	 * This function acquires a well-known name on the bus through the
+	 * KDBUS_CMD_NAME_ACQUIRE ioctl. This ioctl takes an argument of type
+	 * 'struct kdbus_cmd', which is assembled below. See kdbus.name(7).
+	 */
+	size = sizeof(*cmd);
+	size += KDBUS_ITEM_SIZE(strlen(name) + 1);
+
+	cmd = alloca(size);
+	memset(cmd, 0, size);
+	cmd->size = size;
+
+	/*
+	 * The command requires an item of type KDBUS_ITEM_NAME, and its
+	 * content must be a valid bus name.
+	 */
+	item = cmd->items;
+	item->type = KDBUS_ITEM_NAME;
+	item->size = KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1;
+	strcpy(item->str, name);
+
+	/*
+	 * Employ the command on the connection owner file descriptor.
+	 */
+	r = kdbus_cmd_name_acquire(b->fd, cmd);
+	if (r < 0)
+		return err_r(r, "cannot acquire name");
+
+	return 0;
+}
+
+static int bus_install_name_loss_match(struct bus *b, const char *name)
+{
+	struct kdbus_cmd_match *match;
+	struct kdbus_item *item;
+	size_t size;
+	int r;
+
+	/*
+	 * In order to install a match for signal messages, we have to
+	 * assemble a 'struct kdbus_cmd_match' and use it along with the
+	 * KDBUS_CMD_MATCH_ADD ioctl. See kdbus.match(7).
+	 */
+	size = sizeof(*match);
+	size += KDBUS_ITEM_SIZE(sizeof(item->name_change) + strlen(name) + 1);
+
+	match = alloca(size);
+	memset(match, 0, size);
+	match->size = size;
+
+	/*
+	 * A match is comprised of many 'rules', each of which describes a
+	 * mandatory detail of the message. All rules of a match must be
+	 * satisfied in order to make a message pass.
+	 */
+	item = match->items;
+
+	/*
+	 * In this case, we're interested in notifications that inform us
+	 * about a well-known name being removed from the bus.
+	 */
+	item->type = KDBUS_ITEM_NAME_REMOVE;
+	item->size = KDBUS_ITEM_HEADER_SIZE +
+		     sizeof(item->name_change) + strlen(name) + 1;
+
+	/*
+	 * We could limit the match further and require a specific unique-ID
+	 * to be the new or the old owner of the name. In this case, however,
+	 * we don't, and allow 'any' id.
+	 */
+	item->name_change.old_id.id = KDBUS_MATCH_ID_ANY;
+	item->name_change.new_id.id = KDBUS_MATCH_ID_ANY;
+
+	/* Copy in the well-known name we're interested in */
+	strcpy(item->name_change.name, name);
+
+	/*
+	 * Add the match through the KDBUS_CMD_MATCH_ADD ioctl, employed on
+	 * the connection owner fd.
+	 */
+	r = kdbus_cmd_match_add(b->fd, match);
+	if (r < 0)
+		return err_r(r, "cannot add match");
+
+	return 0;
+}
+
+static int bus_poll(struct bus *b)
+{
+	struct pollfd fds[1] = {};
+	int r;
+
+	/*
+	 * A connection endpoint supports poll() and will wake up the
+	 * task with POLLIN set once a message has arrived.
+	 */
+	fds[0].fd = b->fd;
+	fds[0].events = POLLIN;
+	r = poll(fds, sizeof(fds) / sizeof(*fds), 0);
+	if (r < 0)
+		return err("cannot poll bus");
+	return !!(fds[0].revents & POLLIN);
+}
+
+static int bus_make(uid_t uid, const char *name)
+{
+	struct kdbus_item *item;
+	struct kdbus_cmd *make;
+	char path[128], busname[128];
+	size_t size;
+	int r, fd;
+
+	/*
+	 * Compute the full path to the 'control' node. 'arg_modname' may be
+	 * set to a different value than 'kdbus' for development purposes.
+	 * The 'control' node is the primary entry point to kdbus that must be
+	 * used in order to create a bus. See kdbus(7) and kdbus.bus(7).
+	 */
+	snprintf(path, sizeof(path), "/sys/fs/%s/control", arg_modname);
+
+	/*
+	 * Compute the bus name. A valid bus name must always be prefixed with
+	 * the EUID of the currently running process in order to avoid name
+	 * conflicts. See kdbus.bus(7).
+	 */
+	snprintf(busname, sizeof(busname), "%lu-%s", (unsigned long)uid, name);
+
+	fd = open(path, O_RDWR | O_CLOEXEC);
+	if (fd < 0)
+		return err("cannot open control file");
+
+	/*
+	 * The KDBUS_CMD_BUS_MAKE ioctl takes an argument of type
+	 * 'struct kdbus_cmd', and expects at least two items attached to
+	 * it: one to describe the bloom parameters to be propagated to
+	 * connections of the bus, and the name of the bus that was computed
+	 * above. Assemble this struct now, and fill it with values.
+	 */
+	size = sizeof(*make);
+	size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_bloom_parameter));
+	size += KDBUS_ITEM_SIZE(strlen(busname) + 1);
+
+	make = alloca(size);
+	memset(make, 0, size);
+	make->size = size;
+
+	/*
+	 * Each item has a 'type' and 'size' field, and must be stored at an
+	 * 8-byte aligned address. The KDBUS_ITEM_NEXT macro is used to advance
+	 * the pointer. See kdbus.item(7) for more details.
+	 */
+	item = make->items;
+	item->type = KDBUS_ITEM_BLOOM_PARAMETER;
+	item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(item->bloom_parameter);
+	item->bloom_parameter.size = 8;
+	item->bloom_parameter.n_hash = 1;
+
+	/* The name of the new bus is stored in the next item. */
+	item = KDBUS_ITEM_NEXT(item);
+	item->type = KDBUS_ITEM_MAKE_NAME;
+	item->size = KDBUS_ITEM_HEADER_SIZE + strlen(busname) + 1;
+	strcpy(item->str, busname);
+
+	/*
+	 * Now create the bus via the KDBUS_CMD_BUS_MAKE ioctl and return the
+	 * fd that was used back to the caller of this function. This fd is now
+	 * called a 'bus owner file descriptor', and it controls the life-time
+	 * of the newly created bus; once the file descriptor is closed, the
+	 * bus goes away, and all connections are shut down. See kdbus.bus(7).
+	 */
+	r = kdbus_cmd_bus_make(fd, make);
+	if (r < 0) {
+		err_r(r, "cannot make bus");
+		close(fd);
+		return r;
+	}
+
+	return fd;
+}
+
+#else
+
+#warning "Skipping compilation due to unsupported libc version"
+
+int main(int argc, char **argv)
+{
+	fprintf(stderr,
+		"Compilation of %s was skipped due to unsupported libc.\n",
+		argv[0]);
+
+	return EXIT_FAILURE;
+}
+
+#endif /* libc sanity check */
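One property of the sample above is easy to miss: bus_open_connection() maps the pool with PROT_READ only. kdbus pools are not writable from userspace (the 'writable-pool' selftest added below checks exactly this), so a fragment like the following is expected to fail; 'bus_fd' stands in for any connection owner file descriptor, and the snippet is illustrative only:

	/* Illustrative only: a writable pool mapping should be refused. */
	void *p = mmap(NULL, POOL_SIZE, PROT_READ | PROT_WRITE,
		       MAP_SHARED, bus_fd, 0);
	if (p == MAP_FAILED)
		fprintf(stderr, "expected: the pool stays read-only\n");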
diff -Nur linux-4.2/scripts/mkcompile_h linux-4.2-pck/scripts/mkcompile_h
--- linux-4.2/scripts/mkcompile_h 2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/scripts/mkcompile_h 2015-09-08 00:48:22.245029757 -0300
@@ -54,8 +54,8 @@
 fi
 
 UTS_VERSION="#$VERSION"
-CONFIG_FLAGS=""
-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
+CONFIG_FLAGS="PCK"
+if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi
 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
 
diff -Nur linux-4.2/tools/testing/selftests/Makefile linux-4.2-pck/tools/testing/selftests/Makefile
--- linux-4.2/tools/testing/selftests/Makefile 2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/tools/testing/selftests/Makefile 2015-09-08 00:48:22.245029757 -0300
@@ -6,6 +6,7 @@
 TARGETS += ftrace
 TARGETS += futex
 TARGETS += kcmp
+TARGETS += kdbus
 TARGETS += memfd
 TARGETS += memory-hotplug
 TARGETS += mount
diff -Nur linux-4.2/tools/testing/selftests/kdbus/.gitignore linux-4.2-pck/tools/testing/selftests/kdbus/.gitignore
--- linux-4.2/tools/testing/selftests/kdbus/.gitignore 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/tools/testing/selftests/kdbus/.gitignore 2015-09-08 00:48:22.245029757 -0300
@@ -0,0 +1 @@
+kdbus-test
diff -Nur linux-4.2/tools/testing/selftests/kdbus/Makefile linux-4.2-pck/tools/testing/selftests/kdbus/Makefile
--- linux-4.2/tools/testing/selftests/kdbus/Makefile 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/tools/testing/selftests/kdbus/Makefile 2015-09-08 00:48:22.245029757 -0300
@@ -0,0 +1,48 @@
+CFLAGS += -I../../../../usr/include/
+CFLAGS += -I../../../../samples/kdbus/
+CFLAGS += -I../../../../include/uapi/
+CFLAGS += -std=gnu99
+CFLAGS += -DKBUILD_MODNAME=\"kdbus\" -D_GNU_SOURCE
+LDLIBS = -pthread -lcap -lm
+
+OBJS= \
+	kdbus-enum.o \
+	kdbus-util.o \
+	kdbus-test.o \
+	test-activator.o \
+	test-benchmark.o \
+	test-bus.o \
+	test-chat.o \
+	test-connection.o \
+	test-daemon.o \
+	test-endpoint.o \
+	test-fd.o \
+	test-free.o \
+	test-match.o \
+	test-message.o \
+	test-metadata-ns.o \
+	test-monitor.o \
+	test-names.o \
+	test-policy.o \
+	test-policy-ns.o \
+	test-policy-priv.o \
+	test-sync.o \
+	test-timeout.o
+
+all: kdbus-test
+
+include ../lib.mk
+
+%.o: %.c kdbus-enum.h kdbus-test.h kdbus-util.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+kdbus-test: $(OBJS)
+	$(CC) $(CFLAGS) $^ $(LDLIBS) -o $@
+
+TEST_PROGS := kdbus-test
+
+run_tests:
+	./kdbus-test --tap
+
+clean:
+	rm -f *.o kdbus-test
diff -Nur linux-4.2/tools/testing/selftests/kdbus/kdbus-enum.c linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-enum.c
--- linux-4.2/tools/testing/selftests/kdbus/kdbus-enum.c 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-enum.c 2015-09-08 00:48:22.245029757 -0300
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kdbus-util.h" +#include "kdbus-enum.h" + +struct kdbus_enum_table { + long long id; + const char *name; +}; + +#define TABLE(what) static struct kdbus_enum_table kdbus_table_##what[] +#define ENUM(_id) { .id = _id, .name = STRINGIFY(_id) } +#define LOOKUP(what) \ + const char *enum_##what(long long id) \ + { \ + for (size_t i = 0; i < ELEMENTSOF(kdbus_table_##what); i++) \ + if (id == kdbus_table_##what[i].id) \ + return kdbus_table_##what[i].name; \ + return "UNKNOWN"; \ + } + +TABLE(CMD) = { + ENUM(KDBUS_CMD_BUS_MAKE), + ENUM(KDBUS_CMD_ENDPOINT_MAKE), + ENUM(KDBUS_CMD_HELLO), + ENUM(KDBUS_CMD_SEND), + ENUM(KDBUS_CMD_RECV), + ENUM(KDBUS_CMD_LIST), + ENUM(KDBUS_CMD_NAME_RELEASE), + ENUM(KDBUS_CMD_CONN_INFO), + ENUM(KDBUS_CMD_MATCH_ADD), + ENUM(KDBUS_CMD_MATCH_REMOVE), +}; +LOOKUP(CMD); + +TABLE(MSG) = { + ENUM(_KDBUS_ITEM_NULL), + ENUM(KDBUS_ITEM_PAYLOAD_VEC), + ENUM(KDBUS_ITEM_PAYLOAD_OFF), + ENUM(KDBUS_ITEM_PAYLOAD_MEMFD), + ENUM(KDBUS_ITEM_FDS), + ENUM(KDBUS_ITEM_BLOOM_PARAMETER), + ENUM(KDBUS_ITEM_BLOOM_FILTER), + ENUM(KDBUS_ITEM_DST_NAME), + ENUM(KDBUS_ITEM_MAKE_NAME), + ENUM(KDBUS_ITEM_ATTACH_FLAGS_SEND), + ENUM(KDBUS_ITEM_ATTACH_FLAGS_RECV), + ENUM(KDBUS_ITEM_ID), + ENUM(KDBUS_ITEM_NAME), + ENUM(KDBUS_ITEM_TIMESTAMP), + ENUM(KDBUS_ITEM_CREDS), + ENUM(KDBUS_ITEM_PIDS), + ENUM(KDBUS_ITEM_AUXGROUPS), + ENUM(KDBUS_ITEM_OWNED_NAME), + ENUM(KDBUS_ITEM_TID_COMM), + ENUM(KDBUS_ITEM_PID_COMM), + ENUM(KDBUS_ITEM_EXE), + ENUM(KDBUS_ITEM_CMDLINE), + ENUM(KDBUS_ITEM_CGROUP), + ENUM(KDBUS_ITEM_CAPS), + ENUM(KDBUS_ITEM_SECLABEL), + ENUM(KDBUS_ITEM_AUDIT), + ENUM(KDBUS_ITEM_CONN_DESCRIPTION), + ENUM(KDBUS_ITEM_NAME_ADD), + ENUM(KDBUS_ITEM_NAME_REMOVE), + ENUM(KDBUS_ITEM_NAME_CHANGE), + ENUM(KDBUS_ITEM_ID_ADD), + ENUM(KDBUS_ITEM_ID_REMOVE), + ENUM(KDBUS_ITEM_REPLY_TIMEOUT), + ENUM(KDBUS_ITEM_REPLY_DEAD), +}; +LOOKUP(MSG); + +TABLE(PAYLOAD) = { + ENUM(KDBUS_PAYLOAD_KERNEL), + ENUM(KDBUS_PAYLOAD_DBUS), +}; +LOOKUP(PAYLOAD); diff -Nur linux-4.2/tools/testing/selftests/kdbus/kdbus-enum.h linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-enum.h --- linux-4.2/tools/testing/selftests/kdbus/kdbus-enum.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-enum.h 2015-09-08 00:48:22.245029757 -0300 @@ -0,0 +1,15 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#pragma once + +const char *enum_CMD(long long id); +const char *enum_MSG(long long id); +const char *enum_MATCH(long long id); +const char *enum_PAYLOAD(long long id); diff -Nur linux-4.2/tools/testing/selftests/kdbus/kdbus-test.c linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-test.c --- linux-4.2/tools/testing/selftests/kdbus/kdbus-test.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-test.c 2015-09-08 00:48:22.245029757 -0300 @@ -0,0 +1,905 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kdbus-util.h" +#include "kdbus-enum.h" +#include "kdbus-test.h" + +enum { + TEST_CREATE_BUS = 1 << 0, + TEST_CREATE_CONN = 1 << 1, +}; + +struct kdbus_test { + const char *name; + const char *desc; + int (*func)(struct kdbus_test_env *env); + unsigned int flags; +}; + +struct kdbus_test_args { + bool mntns; + bool pidns; + bool userns; + char *uid_map; + char *gid_map; + int loop; + int wait; + int fork; + int tap_output; + char *module; + char *root; + char *test; + char *busname; +}; + +static const struct kdbus_test tests[] = { + { + .name = "bus-make", + .desc = "bus make functions", + .func = kdbus_test_bus_make, + .flags = 0, + }, + { + .name = "hello", + .desc = "the HELLO command", + .func = kdbus_test_hello, + .flags = TEST_CREATE_BUS, + }, + { + .name = "byebye", + .desc = "the BYEBYE command", + .func = kdbus_test_byebye, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "chat", + .desc = "a chat pattern", + .func = kdbus_test_chat, + .flags = TEST_CREATE_BUS, + }, + { + .name = "daemon", + .desc = "a simple daemon", + .func = kdbus_test_daemon, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "fd-passing", + .desc = "file descriptor passing", + .func = kdbus_test_fd_passing, + .flags = TEST_CREATE_BUS, + }, + { + .name = "endpoint", + .desc = "custom endpoint", + .func = kdbus_test_custom_endpoint, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "monitor", + .desc = "monitor functionality", + .func = kdbus_test_monitor, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "name-basics", + .desc = "basic name registry functions", + .func = kdbus_test_name_basic, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "name-conflict", + .desc = "name registry conflict details", + .func = kdbus_test_name_conflict, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "name-queue", + .desc = "queuing of names", + .func = kdbus_test_name_queue, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "name-takeover", + .desc = "takeover of names", + .func = kdbus_test_name_takeover, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "message-basic", + .desc = "basic message handling", + .func = kdbus_test_message_basic, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "message-prio", + .desc = "handling of messages with priority", + .func = kdbus_test_message_prio, + .flags = TEST_CREATE_BUS, + }, + { + .name = "message-quota", + .desc = "message quotas are enforced", + .func = kdbus_test_message_quota, + .flags = TEST_CREATE_BUS, + }, + { + .name = "memory-access", + .desc = "memory access", + .func = kdbus_test_memory_access, + .flags = TEST_CREATE_BUS, + }, + { + .name = "timeout", + .desc = "timeout", + .func = kdbus_test_timeout, + .flags = TEST_CREATE_BUS, + 
}, + { + .name = "sync-byebye", + .desc = "synchronous replies vs. BYEBYE", + .func = kdbus_test_sync_byebye, + .flags = TEST_CREATE_BUS, + }, + { + .name = "sync-reply", + .desc = "synchronous replies", + .func = kdbus_test_sync_reply, + .flags = TEST_CREATE_BUS, + }, + { + .name = "message-free", + .desc = "freeing of memory", + .func = kdbus_test_free, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "connection-info", + .desc = "retrieving connection information", + .func = kdbus_test_conn_info, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "connection-update", + .desc = "updating connection information", + .func = kdbus_test_conn_update, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "writable-pool", + .desc = "verifying pools are never writable", + .func = kdbus_test_writable_pool, + .flags = TEST_CREATE_BUS, + }, + { + .name = "policy", + .desc = "policy", + .func = kdbus_test_policy, + .flags = TEST_CREATE_BUS, + }, + { + .name = "policy-priv", + .desc = "unprivileged bus access", + .func = kdbus_test_policy_priv, + .flags = TEST_CREATE_BUS, + }, + { + .name = "policy-ns", + .desc = "policy in user namespaces", + .func = kdbus_test_policy_ns, + .flags = TEST_CREATE_BUS, + }, + { + .name = "metadata-ns", + .desc = "metadata in different namespaces", + .func = kdbus_test_metadata_ns, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-id-add", + .desc = "adding of matches by id", + .func = kdbus_test_match_id_add, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-id-remove", + .desc = "removing of matches by id", + .func = kdbus_test_match_id_remove, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-replace", + .desc = "replace of matches with the same cookie", + .func = kdbus_test_match_replace, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-name-add", + .desc = "adding of matches by name", + .func = kdbus_test_match_name_add, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-name-remove", + .desc = "removing of matches by name", + .func = kdbus_test_match_name_remove, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-name-change", + .desc = "matching for name changes", + .func = kdbus_test_match_name_change, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-bloom", + .desc = "matching with bloom filters", + .func = kdbus_test_match_bloom, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "activator", + .desc = "activator connections", + .func = kdbus_test_activator, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "benchmark", + .desc = "benchmark", + .func = kdbus_test_benchmark, + .flags = TEST_CREATE_BUS, + }, + { + .name = "benchmark-nomemfds", + .desc = "benchmark without using memfds", + .func = kdbus_test_benchmark_nomemfds, + .flags = TEST_CREATE_BUS, + }, + { + .name = "benchmark-uds", + .desc = "benchmark comparison to UDS", + .func = kdbus_test_benchmark_uds, + .flags = TEST_CREATE_BUS, + }, +}; + +#define N_TESTS ((int) (sizeof(tests) / sizeof(tests[0]))) + +static int test_prepare_env(const struct kdbus_test *t, + const struct kdbus_test_args *args, + struct kdbus_test_env *env) +{ + if (t->flags & TEST_CREATE_BUS) { + char *s; + char *n = NULL; + int ret; + + asprintf(&s, "%s/control", args->root); + + env->control_fd = open(s, O_RDWR); + free(s); + ASSERT_RETURN(env->control_fd >= 0); + + if (!args->busname) { + n = 
unique_name("test-bus"); + ASSERT_RETURN(n); + } + + ret = kdbus_create_bus(env->control_fd, + args->busname ?: n, + _KDBUS_ATTACH_ALL, &s); + free(n); + ASSERT_RETURN(ret == 0); + + asprintf(&env->buspath, "%s/%s/bus", args->root, s); + free(s); + } + + if (t->flags & TEST_CREATE_CONN) { + env->conn = kdbus_hello(env->buspath, 0, NULL, 0); + ASSERT_RETURN(env->conn); + } + + env->root = args->root; + env->module = args->module; + + return 0; +} + +void test_unprepare_env(const struct kdbus_test *t, struct kdbus_test_env *env) +{ + if (env->conn) { + kdbus_conn_free(env->conn); + env->conn = NULL; + } + + if (env->control_fd >= 0) { + close(env->control_fd); + env->control_fd = -1; + } + + if (env->buspath) { + free(env->buspath); + env->buspath = NULL; + } +} + +static int test_run(const struct kdbus_test *t, + const struct kdbus_test_args *kdbus_args, + int wait) +{ + int ret; + struct kdbus_test_env env = {}; + + ret = test_prepare_env(t, kdbus_args, &env); + if (ret != TEST_OK) + return ret; + + if (wait > 0) { + printf("Sleeping %d seconds before running test ...\n", wait); + sleep(wait); + } + + ret = t->func(&env); + test_unprepare_env(t, &env); + return ret; +} + +static int test_run_forked(const struct kdbus_test *t, + const struct kdbus_test_args *kdbus_args, + int wait) +{ + int ret; + pid_t pid; + + pid = fork(); + if (pid < 0) { + return TEST_ERR; + } else if (pid == 0) { + ret = test_run(t, kdbus_args, wait); + _exit(ret); + } + + pid = waitpid(pid, &ret, 0); + if (pid <= 0) + return TEST_ERR; + else if (!WIFEXITED(ret)) + return TEST_ERR; + else + return WEXITSTATUS(ret); +} + +static void print_test_result(int ret) +{ + switch (ret) { + case TEST_OK: + printf("OK"); + break; + case TEST_SKIP: + printf("SKIPPED"); + break; + case TEST_ERR: + printf("ERROR"); + break; + } +} + +static int start_all_tests(struct kdbus_test_args *kdbus_args) +{ + int ret; + unsigned int fail_cnt = 0; + unsigned int skip_cnt = 0; + unsigned int ok_cnt = 0; + unsigned int i; + + if (kdbus_args->tap_output) { + printf("1..%d\n", N_TESTS); + fflush(stdout); + } + + kdbus_util_verbose = false; + + for (i = 0; i < N_TESTS; i++) { + const struct kdbus_test *t = tests + i; + + if (!kdbus_args->tap_output) { + unsigned int n; + + printf("Testing %s (%s) ", t->desc, t->name); + for (n = 0; n < 60 - strlen(t->desc) - strlen(t->name); n++) + printf("."); + printf(" "); + } + + ret = test_run_forked(t, kdbus_args, 0); + switch (ret) { + case TEST_OK: + ok_cnt++; + break; + case TEST_SKIP: + skip_cnt++; + break; + case TEST_ERR: + fail_cnt++; + break; + } + + if (kdbus_args->tap_output) { + printf("%sok %d - %s%s (%s)\n", + (ret == TEST_ERR) ? "not " : "", i + 1, + (ret == TEST_SKIP) ? "# SKIP " : "", + t->desc, t->name); + fflush(stdout); + } else { + print_test_result(ret); + printf("\n"); + } + } + + if (kdbus_args->tap_output) + printf("Failed %d/%d tests, %.2f%% okay\n", fail_cnt, N_TESTS, + 100.0 - (fail_cnt * 100.0) / ((float) N_TESTS)); + else + printf("\nSUMMARY: %u tests passed, %u skipped, %u failed\n", + ok_cnt, skip_cnt, fail_cnt); + + return fail_cnt > 0 ? 
TEST_ERR : TEST_OK;
+}
+
+static int start_one_test(struct kdbus_test_args *kdbus_args)
+{
+	int i, ret;
+	bool test_found = false;
+
+	for (i = 0; i < N_TESTS; i++) {
+		const struct kdbus_test *t = tests + i;
+
+		if (strcmp(t->name, kdbus_args->test))
+			continue;
+
+		do {
+			test_found = true;
+			if (kdbus_args->fork)
+				ret = test_run_forked(t, kdbus_args,
+						      kdbus_args->wait);
+			else
+				ret = test_run(t, kdbus_args,
+					       kdbus_args->wait);
+
+			printf("Testing %s: ", t->desc);
+			print_test_result(ret);
+			printf("\n");
+
+			if (ret != TEST_OK)
+				break;
+		} while (kdbus_args->loop);
+
+		return ret;
+	}
+
+	if (!test_found) {
+		printf("Unknown test-id '%s'\n", kdbus_args->test);
+		return TEST_ERR;
+	}
+
+	return TEST_OK;
+}
+
+static void usage(const char *argv0)
+{
+	unsigned int i, j;
+
+	printf("Usage: %s [options]\n"
+	       "Options:\n"
+	       "\t-a, --tap		Output test results in TAP format\n"
+	       "\t-m, --module <module>	Kdbus module name\n"
+	       "\t-x, --loop		Run in a loop\n"
+	       "\t-f, --fork		Fork before running a test\n"
+	       "\t-h, --help		Print this help\n"
+	       "\t-r, --root <root>	Toplevel of the kdbus hierarchy\n"
+	       "\t-t, --test <test-id>	Run one specific test only, in verbose mode\n"
+	       "\t-b, --bus <busname>	Instead of generating a random bus name, take <busname>.\n"
+	       "\t-w, --wait <secs>	Wait <secs> before actually starting test\n"
+	       "\t    --mntns		New mount namespace\n"
+	       "\t    --pidns		New PID namespace\n"
+	       "\t    --userns		New user namespace\n"
+	       "\t    --uidmap uid_map	UID map for user namespace\n"
+	       "\t    --gidmap gid_map	GID map for user namespace\n"
+	       "\n", argv0);
+
+	printf("By default, all tests are run once, and a summary is printed.\n"
+	       "Available tests for --test:\n\n");
+
+	for (i = 0; i < N_TESTS; i++) {
+		const struct kdbus_test *t = tests + i;
+
+		printf("\t%s", t->name);
+
+		for (j = 0; j < 24 - strlen(t->name); j++)
+			printf(" ");
+
+		printf("Test %s\n", t->desc);
+	}
+
+	printf("\n");
+	printf("Note that some tests may behave differently when run "
+	       "individually via --test, and may not terminate by "
+	       "themselves.\n");
+
+	exit(EXIT_FAILURE);
+}
+
+void print_kdbus_test_args(struct kdbus_test_args *args)
+{
+	if (args->userns || args->pidns || args->mntns)
+		printf("# Starting tests in new %s%s%s namespaces%s\n",
+		       args->mntns ? "MOUNT " : "",
+		       args->pidns ? "PID " : "",
+		       args->userns ? "USER " : "",
+		       args->mntns ? ", kdbusfs will be remounted" : "");
+	else
+		printf("# Starting tests in the same namespaces\n");
+}
+
+void print_metadata_support(void)
+{
+	bool no_meta_audit, no_meta_cgroups, no_meta_seclabel;
+
+	/*
+	 * KDBUS_ATTACH_CGROUP, KDBUS_ATTACH_AUDIT and
+	 * KDBUS_ATTACH_SECLABEL
+	 */
+	no_meta_audit = !config_auditsyscall_is_enabled();
+	no_meta_cgroups = !config_cgroups_is_enabled();
+	no_meta_seclabel = !config_security_is_enabled();
+
+	if (no_meta_audit | no_meta_cgroups | no_meta_seclabel)
+		printf("# Starting tests without %s%s%s metadata support\n",
+		       no_meta_audit ? "AUDIT " : "",
+		       no_meta_cgroups ? "CGROUP " : "",
+		       no_meta_seclabel ? "SECLABEL " : "");
+	else
+		printf("# Starting tests with full metadata support\n");
+}
+
+int run_tests(struct kdbus_test_args *kdbus_args)
+{
+	int ret;
+	static char control[4096];
+
+	snprintf(control, sizeof(control), "%s/control", kdbus_args->root);
+
+	if (access(control, W_OK) < 0) {
+		printf("Unable to locate control node at '%s'.\n",
+		       control);
+		return TEST_ERR;
+	}
+
+	if (kdbus_args->test) {
+		ret = start_one_test(kdbus_args);
+	} else {
+		do {
+			ret = start_all_tests(kdbus_args);
+			if (ret != TEST_OK)
+				break;
+		} while (kdbus_args->loop);
+	}
+
+	return ret;
+}
+
+static void nop_handler(int sig) {}
+
+static int test_prepare_mounts(struct kdbus_test_args *kdbus_args)
+{
+	int ret;
+	char kdbusfs[64] = {'\0'};
+
+	snprintf(kdbusfs, sizeof(kdbusfs), "%sfs", kdbus_args->module);
+
+	/* make the current mount tree a slave so changes don't propagate back */
+	ret = mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
+	if (ret < 0) {
+		ret = -errno;
+		printf("error mount() root: %d (%m)\n", ret);
+		return ret;
+	}
+
+	/* Remount procfs since we need it in our tests */
+	if (kdbus_args->pidns) {
+		ret = mount("proc", "/proc", "proc",
+			    MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+		if (ret < 0) {
+			ret = -errno;
+			printf("error mount() /proc: %d (%m)\n", ret);
+			return ret;
+		}
+	}
+
+	/* Remount kdbusfs */
+	ret = mount(kdbusfs, kdbus_args->root, kdbusfs,
+		    MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+	if (ret < 0) {
+		ret = -errno;
+		printf("error mount() %s: %d (%m)\n", kdbusfs, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int run_tests_in_namespaces(struct kdbus_test_args *kdbus_args)
+{
+	int ret;
+	int efd = -1;
+	int status;
+	pid_t pid, rpid;
+	struct sigaction oldsa;
+	struct sigaction sa = {
+		.sa_handler = nop_handler,
+		.sa_flags = SA_NOCLDSTOP,
+	};
+
+	efd = eventfd(0, EFD_CLOEXEC);
+	if (efd < 0) {
+		ret = -errno;
+		printf("eventfd() failed: %d (%m)\n", ret);
+		return TEST_ERR;
+	}
+
+	ret = sigaction(SIGCHLD, &sa, &oldsa);
+	if (ret < 0) {
+		ret = -errno;
+		printf("sigaction() failed: %d (%m)\n", ret);
+		return TEST_ERR;
+	}
+
+	/* setup namespaces */
+	pid = syscall(__NR_clone, SIGCHLD|
+		      (kdbus_args->userns ? CLONE_NEWUSER : 0) |
+		      (kdbus_args->mntns ? CLONE_NEWNS : 0) |
+		      (kdbus_args->pidns ? CLONE_NEWPID : 0), NULL);
+	if (pid < 0) {
+		printf("clone() failed: %d (%m)\n", -errno);
+		return TEST_ERR;
+	}
+
+	if (pid == 0) {
+		eventfd_t event_status = 0;
+
+		ret = prctl(PR_SET_PDEATHSIG, SIGKILL);
+		if (ret < 0) {
+			ret = -errno;
+			printf("error prctl(): %d (%m)\n", ret);
+			_exit(TEST_ERR);
+		}
+
+		/* reset the SIGCHLD handler in the child */
+		ret = sigaction(SIGCHLD, &oldsa, NULL);
+		if (ret < 0) {
+			ret = -errno;
+			printf("sigaction() failed: %d (%m)\n", ret);
+			_exit(TEST_ERR);
+		}
+
+		ret = eventfd_read(efd, &event_status);
+		if (ret < 0 || event_status != 1) {
+			printf("error eventfd_read()\n");
+			_exit(TEST_ERR);
+		}
+
+		if (kdbus_args->mntns) {
+			ret = test_prepare_mounts(kdbus_args);
+			if (ret < 0) {
+				printf("error preparing mounts\n");
+				_exit(TEST_ERR);
+			}
+		}
+
+		ret = run_tests(kdbus_args);
+		_exit(ret);
+	}
+
+	/* Setup userns mapping */
+	if (kdbus_args->userns) {
+		ret = userns_map_uid_gid(pid, kdbus_args->uid_map,
+					 kdbus_args->gid_map);
+		if (ret < 0) {
+			printf("error mapping uid and gid in userns\n");
+			eventfd_write(efd, 2);
+			return TEST_ERR;
+		}
+	}
+
+	ret = eventfd_write(efd, 1);
+	if (ret < 0) {
+		ret = -errno;
+		printf("error eventfd_write(): %d (%m)\n", ret);
+		return TEST_ERR;
+	}
+
+	rpid = waitpid(pid, &status, 0);
+	ASSERT_RETURN_VAL(rpid == pid, TEST_ERR);
+
+	close(efd);
+
+	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+		return TEST_ERR;
+
+	return TEST_OK;
+}
+
+int start_tests(struct kdbus_test_args *kdbus_args)
+{
+	int ret;
+	bool namespaces;
+	static char fspath[4096];
+
+	namespaces = (kdbus_args->mntns || kdbus_args->pidns ||
+		      kdbus_args->userns);
+
+	/* for pidns we need mntns set */
+	if (kdbus_args->pidns && !kdbus_args->mntns) {
+		printf("Failed: please set both pid and mnt namespaces\n");
+		return TEST_ERR;
+	}
+
+	if (kdbus_args->userns) {
+		if (!config_user_ns_is_enabled()) {
+			printf("User namespace not supported\n");
+			return TEST_ERR;
+		}
+
+		if (!kdbus_args->uid_map || !kdbus_args->gid_map) {
+			printf("Failed: please specify both uid and gid mappings\n");
+			return TEST_ERR;
+		}
+	}
+
+	print_kdbus_test_args(kdbus_args);
+	print_metadata_support();
+
+	/* setup kdbus paths */
+	if (!kdbus_args->module)
+		kdbus_args->module = "kdbus";
+
+	if (!kdbus_args->root) {
+		snprintf(fspath, sizeof(fspath), "/sys/fs/%s",
+			 kdbus_args->module);
+		kdbus_args->root = fspath;
+	}
+
+	/* Start tests */
+	if (namespaces)
+		ret = run_tests_in_namespaces(kdbus_args);
+	else
+		ret = run_tests(kdbus_args);
+
+	return ret;
+}
+
+int main(int argc, char *argv[])
+{
+	int t, ret = 0;
+	struct kdbus_test_args *kdbus_args;
+	enum {
+		ARG_MNTNS = 0x100,
+		ARG_PIDNS,
+		ARG_USERNS,
+		ARG_UIDMAP,
+		ARG_GIDMAP,
+	};
+
+	kdbus_args = malloc(sizeof(*kdbus_args));
+	if (!kdbus_args) {
+		printf("unable to malloc() kdbus_args\n");
+		return EXIT_FAILURE;
+	}
+
+	memset(kdbus_args, 0, sizeof(*kdbus_args));
+
+	static const struct option options[] = {
+		{ "loop",	no_argument,		NULL, 'x' },
+		{ "help",	no_argument,		NULL, 'h' },
+		{ "root",	required_argument,	NULL, 'r' },
+		{ "test",	required_argument,	NULL, 't' },
+		{ "bus",	required_argument,	NULL, 'b' },
+		{ "wait",	required_argument,	NULL, 'w' },
+		{ "fork",	no_argument,		NULL, 'f' },
+		{ "module",	required_argument,	NULL, 'm' },
+		{ "tap",	no_argument,		NULL, 'a' },
+		{ "mntns",	no_argument,		NULL, ARG_MNTNS },
+		{ "pidns",	no_argument,		NULL, ARG_PIDNS },
+		{ "userns",	no_argument,		NULL, ARG_USERNS },
+		{ "uidmap",	required_argument,	NULL, ARG_UIDMAP },
+		{ "gidmap",	required_argument,	NULL, ARG_GIDMAP },
+		{}
+	};
+
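+	/*
+	 * Illustrative invocations, assuming the binary is called
+	 * "kdbus-test" (test ids are taken from the tests[] table and
+	 * can be listed with --help):
+	 *
+	 *	kdbus-test --tap
+	 *	kdbus-test --test <test-id> --loop --fork
+	 *	kdbus-test --mntns --pidns
+	 */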
srand(time(NULL)); + + while ((t = getopt_long(argc, argv, "hxfm:r:t:b:w:a", options, NULL)) >= 0) { + switch (t) { + case 'x': + kdbus_args->loop = 1; + break; + + case 'm': + kdbus_args->module = optarg; + break; + + case 'r': + kdbus_args->root = optarg; + break; + + case 't': + kdbus_args->test = optarg; + break; + + case 'b': + kdbus_args->busname = optarg; + break; + + case 'w': + kdbus_args->wait = strtol(optarg, NULL, 10); + break; + + case 'f': + kdbus_args->fork = 1; + break; + + case 'a': + kdbus_args->tap_output = 1; + break; + + case ARG_MNTNS: + kdbus_args->mntns = true; + break; + + case ARG_PIDNS: + kdbus_args->pidns = true; + break; + + case ARG_USERNS: + kdbus_args->userns = true; + break; + + case ARG_UIDMAP: + kdbus_args->uid_map = optarg; + break; + + case ARG_GIDMAP: + kdbus_args->gid_map = optarg; + break; + + default: + case 'h': + usage(argv[0]); + } + } + + ret = start_tests(kdbus_args); + if (ret == TEST_ERR) + return EXIT_FAILURE; + + free(kdbus_args); + + return 0; +} diff -Nur linux-4.2/tools/testing/selftests/kdbus/kdbus-test.h linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-test.h --- linux-4.2/tools/testing/selftests/kdbus/kdbus-test.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-test.h 2015-09-08 00:48:22.245029757 -0300 @@ -0,0 +1,84 @@ +#ifndef _TEST_KDBUS_H_ +#define _TEST_KDBUS_H_ + +struct kdbus_test_env { + char *buspath; + const char *root; + const char *module; + int control_fd; + struct kdbus_conn *conn; +}; + +enum { + TEST_OK, + TEST_SKIP, + TEST_ERR, +}; + +#define ASSERT_RETURN_VAL(cond, val) \ + if (!(cond)) { \ + fprintf(stderr, "Assertion '%s' failed in %s(), %s:%d\n", \ + #cond, __func__, __FILE__, __LINE__); \ + return val; \ + } + +#define ASSERT_EXIT_VAL(cond, val) \ + if (!(cond)) { \ + fprintf(stderr, "Assertion '%s' failed in %s(), %s:%d\n", \ + #cond, __func__, __FILE__, __LINE__); \ + _exit(val); \ + } + +#define ASSERT_BREAK(cond) \ + if (!(cond)) { \ + fprintf(stderr, "Assertion '%s' failed in %s(), %s:%d\n", \ + #cond, __func__, __FILE__, __LINE__); \ + break; \ + } + +#define ASSERT_RETURN(cond) \ + ASSERT_RETURN_VAL(cond, TEST_ERR) + +#define ASSERT_EXIT(cond) \ + ASSERT_EXIT_VAL(cond, EXIT_FAILURE) + +int kdbus_test_activator(struct kdbus_test_env *env); +int kdbus_test_benchmark(struct kdbus_test_env *env); +int kdbus_test_benchmark_nomemfds(struct kdbus_test_env *env); +int kdbus_test_benchmark_uds(struct kdbus_test_env *env); +int kdbus_test_bus_make(struct kdbus_test_env *env); +int kdbus_test_byebye(struct kdbus_test_env *env); +int kdbus_test_chat(struct kdbus_test_env *env); +int kdbus_test_conn_info(struct kdbus_test_env *env); +int kdbus_test_conn_update(struct kdbus_test_env *env); +int kdbus_test_daemon(struct kdbus_test_env *env); +int kdbus_test_custom_endpoint(struct kdbus_test_env *env); +int kdbus_test_fd_passing(struct kdbus_test_env *env); +int kdbus_test_free(struct kdbus_test_env *env); +int kdbus_test_hello(struct kdbus_test_env *env); +int kdbus_test_match_bloom(struct kdbus_test_env *env); +int kdbus_test_match_id_add(struct kdbus_test_env *env); +int kdbus_test_match_id_remove(struct kdbus_test_env *env); +int kdbus_test_match_replace(struct kdbus_test_env *env); +int kdbus_test_match_name_add(struct kdbus_test_env *env); +int kdbus_test_match_name_change(struct kdbus_test_env *env); +int kdbus_test_match_name_remove(struct kdbus_test_env *env); +int kdbus_test_message_basic(struct kdbus_test_env *env); +int kdbus_test_message_prio(struct kdbus_test_env 
*env);
+int kdbus_test_message_quota(struct kdbus_test_env *env);
+int kdbus_test_memory_access(struct kdbus_test_env *env);
+int kdbus_test_metadata_ns(struct kdbus_test_env *env);
+int kdbus_test_monitor(struct kdbus_test_env *env);
+int kdbus_test_name_basic(struct kdbus_test_env *env);
+int kdbus_test_name_conflict(struct kdbus_test_env *env);
+int kdbus_test_name_queue(struct kdbus_test_env *env);
+int kdbus_test_name_takeover(struct kdbus_test_env *env);
+int kdbus_test_policy(struct kdbus_test_env *env);
+int kdbus_test_policy_ns(struct kdbus_test_env *env);
+int kdbus_test_policy_priv(struct kdbus_test_env *env);
+int kdbus_test_sync_byebye(struct kdbus_test_env *env);
+int kdbus_test_sync_reply(struct kdbus_test_env *env);
+int kdbus_test_timeout(struct kdbus_test_env *env);
+int kdbus_test_writable_pool(struct kdbus_test_env *env);
+
+#endif /* _TEST_KDBUS_H_ */
diff -Nur linux-4.2/tools/testing/selftests/kdbus/kdbus-util.c linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-util.c
--- linux-4.2/tools/testing/selftests/kdbus/kdbus-util.c	1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-util.c	2015-09-08 00:48:22.245029757 -0300
@@ -0,0 +1,1612 @@
+/*
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <inttypes.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <grp.h>
+#include <alloca.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <linux/unistd.h>
+
+#ifndef __NR_memfd_create
+  #ifdef __x86_64__
+    #define __NR_memfd_create 319
+  #elif defined __arm__
+    #define __NR_memfd_create 385
+  #else
+    #define __NR_memfd_create 356
+  #endif
+#endif
+
+#include "kdbus-api.h"
+#include "kdbus-util.h"
+#include "kdbus-enum.h"
+
+#ifndef F_ADD_SEALS
+#define F_LINUX_SPECIFIC_BASE	1024
+#define F_ADD_SEALS	(F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS	(F_LINUX_SPECIFIC_BASE + 10)
+
+#define F_SEAL_SEAL	0x0001	/* prevent further seals from being set */
+#define F_SEAL_SHRINK	0x0002	/* prevent file from shrinking */
+#define F_SEAL_GROW	0x0004	/* prevent file from growing */
+#define F_SEAL_WRITE	0x0008	/* prevent writes */
+#endif
+
+int kdbus_util_verbose = true;
+
+int kdbus_sysfs_get_parameter_mask(const char *path, uint64_t *mask)
+{
+	int ret;
+	FILE *file;
+	unsigned long long value;
+
+	file = fopen(path, "r");
+	if (!file) {
+		ret = -errno;
+		kdbus_printf("--- error fopen(): %d (%m)\n", ret);
+		return ret;
+	}
+
+	ret = fscanf(file, "%llu", &value);
+	if (ret != 1) {
+		if (ferror(file))
+			ret = -errno;
+		else
+			ret = -EIO;
+
+		kdbus_printf("--- error fscanf(): %d\n", ret);
+		fclose(file);
+		return ret;
+	}
+
+	*mask = (uint64_t)value;
+
+	fclose(file);
+
+	return 0;
+}
+
+int kdbus_sysfs_set_parameter_mask(const char *path, uint64_t mask)
+{
+	int ret;
+	FILE *file;
+
+	file = fopen(path, "w");
+	if (!file) {
+		ret = -errno;
+		kdbus_printf("--- error fopen(): %d (%m)\n", ret);
+		return ret;
+	}
+
+	ret = fprintf(file, "%llu", (unsigned long long)mask);
+	if (ret <= 0) {
+		ret = -EIO;
+		kdbus_printf("--- error fprintf(): %d\n", ret);
+	}
+
+	fclose(file);
+
+	return ret > 0 ?
0 : ret;
+}
+
+int kdbus_create_bus(int control_fd, const char *name,
+		     uint64_t owner_meta, char **path)
+{
+	struct {
+		struct kdbus_cmd cmd;
+
+		/* bloom size item */
+		struct {
+			uint64_t size;
+			uint64_t type;
+			struct kdbus_bloom_parameter bloom;
+		} bp;
+
+		/* owner metadata items */
+		struct {
+			uint64_t size;
+			uint64_t type;
+			uint64_t flags;
+		} attach;
+
+		/* name item */
+		struct {
+			uint64_t size;
+			uint64_t type;
+			char str[64];
+		} name;
+	} bus_make;
+	int ret;
+
+	memset(&bus_make, 0, sizeof(bus_make));
+	bus_make.bp.size = sizeof(bus_make.bp);
+	bus_make.bp.type = KDBUS_ITEM_BLOOM_PARAMETER;
+	bus_make.bp.bloom.size = 64;
+	bus_make.bp.bloom.n_hash = 1;
+
+	snprintf(bus_make.name.str, sizeof(bus_make.name.str),
+		 "%u-%s", getuid(), name);
+
+	bus_make.attach.type = KDBUS_ITEM_ATTACH_FLAGS_SEND;
+	bus_make.attach.size = sizeof(bus_make.attach);
+	bus_make.attach.flags = owner_meta;
+
+	bus_make.name.type = KDBUS_ITEM_MAKE_NAME;
+	bus_make.name.size = KDBUS_ITEM_HEADER_SIZE +
+			     strlen(bus_make.name.str) + 1;
+
+	bus_make.cmd.flags = KDBUS_MAKE_ACCESS_WORLD;
+	bus_make.cmd.size = sizeof(bus_make.cmd) +
+			    bus_make.bp.size +
+			    bus_make.attach.size +
+			    bus_make.name.size;
+
+	kdbus_printf("Creating bus with name >%s< on control fd %d ...\n",
+		     name, control_fd);
+
+	ret = kdbus_cmd_bus_make(control_fd, &bus_make.cmd);
+	if (ret < 0) {
+		kdbus_printf("--- error when making bus: %d (%m)\n", ret);
+		return ret;
+	}
+
+	if (ret == 0 && path)
+		*path = strdup(bus_make.name.str);
+
+	return ret;
+}
+
+struct kdbus_conn *
+kdbus_hello(const char *path, uint64_t flags,
+	    const struct kdbus_item *item, size_t item_size)
+{
+	struct kdbus_cmd_free cmd_free = {};
+	int fd, ret;
+	struct {
+		struct kdbus_cmd_hello hello;
+
+		struct {
+			uint64_t size;
+			uint64_t type;
+			char str[16];
+		} conn_name;
+
+		uint8_t extra_items[item_size];
+	} h;
+	struct kdbus_conn *conn;
+
+	memset(&h, 0, sizeof(h));
+
+	if (item_size > 0)
+		memcpy(h.extra_items, item, item_size);
+
+	kdbus_printf("-- opening bus connection %s\n", path);
+	fd = open(path, O_RDWR|O_CLOEXEC);
+	if (fd < 0) {
+		kdbus_printf("--- error %d (%m)\n", fd);
+		return NULL;
+	}
+
+	h.hello.flags = flags | KDBUS_HELLO_ACCEPT_FD;
+	h.hello.attach_flags_send = _KDBUS_ATTACH_ALL;
+	h.hello.attach_flags_recv = _KDBUS_ATTACH_ALL;
+	h.conn_name.type = KDBUS_ITEM_CONN_DESCRIPTION;
+	strcpy(h.conn_name.str, "this-is-my-name");
+	h.conn_name.size = KDBUS_ITEM_HEADER_SIZE + strlen(h.conn_name.str) + 1;
+
+	h.hello.size = sizeof(h);
+	h.hello.pool_size = POOL_SIZE;
+
+	ret = kdbus_cmd_hello(fd, (struct kdbus_cmd_hello *) &h.hello);
+	if (ret < 0) {
+		kdbus_printf("--- error when saying hello: %d (%m)\n", ret);
+		close(fd);	/* don't leak the connection fd on error */
+		return NULL;
+	}
+	kdbus_printf("-- Our peer ID for %s: %llu -- bus uuid: '%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x'\n",
+		     path, (unsigned long long)h.hello.id,
+		     h.hello.id128[0],  h.hello.id128[1],  h.hello.id128[2],
+		     h.hello.id128[3],  h.hello.id128[4],  h.hello.id128[5],
+		     h.hello.id128[6],  h.hello.id128[7],  h.hello.id128[8],
+		     h.hello.id128[9],  h.hello.id128[10], h.hello.id128[11],
+		     h.hello.id128[12], h.hello.id128[13], h.hello.id128[14],
+		     h.hello.id128[15]);
+
+	cmd_free.size = sizeof(cmd_free);
+	cmd_free.offset = h.hello.offset;
+	kdbus_cmd_free(fd, &cmd_free);
+
+	conn = malloc(sizeof(*conn));
+	if (!conn) {
+		kdbus_printf("unable to malloc()!?\n");
+		close(fd);
+		return NULL;
+	}
+
+	conn->buf = mmap(NULL, POOL_SIZE, PROT_READ, MAP_SHARED, fd, 0);
+	if (conn->buf == MAP_FAILED) {
+		free(conn);
+		close(fd);
kdbus_printf("--- error mmap (%m)\n"); + return NULL; + } + + conn->fd = fd; + conn->id = h.hello.id; + return conn; +} + +struct kdbus_conn * +kdbus_hello_registrar(const char *path, const char *name, + const struct kdbus_policy_access *access, + size_t num_access, uint64_t flags) +{ + struct kdbus_item *item, *items; + size_t i, size; + + size = KDBUS_ITEM_SIZE(strlen(name) + 1) + + num_access * KDBUS_ITEM_SIZE(sizeof(*access)); + + items = alloca(size); + + item = items; + item->size = KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + item->type = KDBUS_ITEM_NAME; + strcpy(item->str, name); + item = KDBUS_ITEM_NEXT(item); + + for (i = 0; i < num_access; i++) { + item->size = KDBUS_ITEM_HEADER_SIZE + + sizeof(struct kdbus_policy_access); + item->type = KDBUS_ITEM_POLICY_ACCESS; + + item->policy_access.type = access[i].type; + item->policy_access.access = access[i].access; + item->policy_access.id = access[i].id; + + item = KDBUS_ITEM_NEXT(item); + } + + return kdbus_hello(path, flags, items, size); +} + +struct kdbus_conn *kdbus_hello_activator(const char *path, const char *name, + const struct kdbus_policy_access *access, + size_t num_access) +{ + return kdbus_hello_registrar(path, name, access, num_access, + KDBUS_HELLO_ACTIVATOR); +} + +bool kdbus_item_in_message(struct kdbus_msg *msg, uint64_t type) +{ + const struct kdbus_item *item; + + KDBUS_ITEM_FOREACH(item, msg, items) + if (item->type == type) + return true; + + return false; +} + +int kdbus_bus_creator_info(struct kdbus_conn *conn, + uint64_t flags, + uint64_t *offset) +{ + struct kdbus_cmd_info *cmd; + size_t size = sizeof(*cmd); + int ret; + + cmd = alloca(size); + memset(cmd, 0, size); + cmd->size = size; + cmd->attach_flags = flags; + + ret = kdbus_cmd_bus_creator_info(conn->fd, cmd); + if (ret < 0) { + kdbus_printf("--- error when requesting info: %d (%m)\n", ret); + return ret; + } + + if (offset) + *offset = cmd->offset; + else + kdbus_free(conn, cmd->offset); + + return 0; +} + +int kdbus_conn_info(struct kdbus_conn *conn, uint64_t id, + const char *name, uint64_t flags, + uint64_t *offset) +{ + struct kdbus_cmd_info *cmd; + size_t size = sizeof(*cmd); + struct kdbus_info *info; + int ret; + + if (name) + size += KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + + cmd = alloca(size); + memset(cmd, 0, size); + cmd->size = size; + cmd->attach_flags = flags; + + if (name) { + cmd->items[0].size = KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + cmd->items[0].type = KDBUS_ITEM_NAME; + strcpy(cmd->items[0].str, name); + } else { + cmd->id = id; + } + + ret = kdbus_cmd_conn_info(conn->fd, cmd); + if (ret < 0) { + kdbus_printf("--- error when requesting info: %d (%m)\n", ret); + return ret; + } + + info = (struct kdbus_info *) (conn->buf + cmd->offset); + if (info->size != cmd->info_size) { + kdbus_printf("%s(): size mismatch: %d != %d\n", __func__, + (int) info->size, (int) cmd->info_size); + return -EIO; + } + + if (offset) + *offset = cmd->offset; + else + kdbus_free(conn, cmd->offset); + + return 0; +} + +void kdbus_conn_free(struct kdbus_conn *conn) +{ + if (!conn) + return; + + if (conn->buf) + munmap(conn->buf, POOL_SIZE); + + if (conn->fd >= 0) + close(conn->fd); + + free(conn); +} + +int sys_memfd_create(const char *name, __u64 size) +{ + int ret, fd; + + fd = syscall(__NR_memfd_create, name, MFD_ALLOW_SEALING); + if (fd < 0) + return fd; + + ret = ftruncate(fd, size); + if (ret < 0) { + close(fd); + return ret; + } + + return fd; +} + +int sys_memfd_seal_set(int fd) +{ + return fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | + 
F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL); +} + +off_t sys_memfd_get_size(int fd, off_t *size) +{ + struct stat stat; + int ret; + + ret = fstat(fd, &stat); + if (ret < 0) { + kdbus_printf("stat() failed: %m\n"); + return ret; + } + + *size = stat.st_size; + return 0; +} + +static int __kdbus_msg_send(const struct kdbus_conn *conn, + const char *name, + uint64_t cookie, + uint64_t flags, + uint64_t timeout, + int64_t priority, + uint64_t dst_id, + uint64_t cmd_flags, + int cancel_fd) +{ + struct kdbus_cmd_send *cmd = NULL; + struct kdbus_msg *msg = NULL; + const char ref1[1024 * 128 + 3] = "0123456789_0"; + const char ref2[] = "0123456789_1"; + struct kdbus_item *item; + struct timespec now; + uint64_t size; + int memfd = -1; + int ret; + + size = sizeof(*msg) + 3 * KDBUS_ITEM_SIZE(sizeof(struct kdbus_vec)); + + if (dst_id == KDBUS_DST_ID_BROADCAST) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_bloom_filter)) + 64; + else { + memfd = sys_memfd_create("my-name-is-nice", 1024 * 1024); + if (memfd < 0) { + kdbus_printf("failed to create memfd: %m\n"); + return memfd; + } + + if (write(memfd, "kdbus memfd 1234567", 19) != 19) { + ret = -errno; + kdbus_printf("writing to memfd failed: %m\n"); + goto out; + } + + ret = sys_memfd_seal_set(memfd); + if (ret < 0) { + ret = -errno; + kdbus_printf("memfd sealing failed: %m\n"); + goto out; + } + + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_memfd)); + } + + if (name) + size += KDBUS_ITEM_SIZE(strlen(name) + 1); + + msg = malloc(size); + if (!msg) { + ret = -errno; + kdbus_printf("unable to malloc()!?\n"); + goto out; + } + + if (dst_id == KDBUS_DST_ID_BROADCAST) + flags |= KDBUS_MSG_SIGNAL; + + memset(msg, 0, size); + msg->flags = flags; + msg->priority = priority; + msg->size = size; + msg->src_id = conn->id; + msg->dst_id = name ? 
0 : dst_id; + msg->cookie = cookie; + msg->payload_type = KDBUS_PAYLOAD_DBUS; + + if (timeout) { + ret = clock_gettime(CLOCK_MONOTONIC_COARSE, &now); + if (ret < 0) + goto out; + + msg->timeout_ns = now.tv_sec * 1000000000ULL + + now.tv_nsec + timeout; + } + + item = msg->items; + + if (name) { + item->type = KDBUS_ITEM_DST_NAME; + item->size = KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + strcpy(item->str, name); + item = KDBUS_ITEM_NEXT(item); + } + + item->type = KDBUS_ITEM_PAYLOAD_VEC; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec); + item->vec.address = (uintptr_t)&ref1; + item->vec.size = sizeof(ref1); + item = KDBUS_ITEM_NEXT(item); + + /* data padding for ref1 */ + item->type = KDBUS_ITEM_PAYLOAD_VEC; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec); + item->vec.address = (uintptr_t)NULL; + item->vec.size = KDBUS_ALIGN8(sizeof(ref1)) - sizeof(ref1); + item = KDBUS_ITEM_NEXT(item); + + item->type = KDBUS_ITEM_PAYLOAD_VEC; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec); + item->vec.address = (uintptr_t)&ref2; + item->vec.size = sizeof(ref2); + item = KDBUS_ITEM_NEXT(item); + + if (dst_id == KDBUS_DST_ID_BROADCAST) { + item->type = KDBUS_ITEM_BLOOM_FILTER; + item->size = KDBUS_ITEM_SIZE(sizeof(struct kdbus_bloom_filter)) + 64; + item->bloom_filter.generation = 0; + } else { + item->type = KDBUS_ITEM_PAYLOAD_MEMFD; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_memfd); + item->memfd.size = 16; + item->memfd.fd = memfd; + } + item = KDBUS_ITEM_NEXT(item); + + size = sizeof(*cmd); + if (cancel_fd != -1) + size += KDBUS_ITEM_SIZE(sizeof(cancel_fd)); + + cmd = malloc(size); + if (!cmd) { + ret = -errno; + kdbus_printf("unable to malloc()!?\n"); + goto out; + } + + cmd->size = size; + cmd->flags = cmd_flags; + cmd->msg_address = (uintptr_t)msg; + + item = cmd->items; + + if (cancel_fd != -1) { + item->type = KDBUS_ITEM_CANCEL_FD; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(cancel_fd); + item->fds[0] = cancel_fd; + item = KDBUS_ITEM_NEXT(item); + } + + ret = kdbus_cmd_send(conn->fd, cmd); + if (ret < 0) { + kdbus_printf("error sending message: %d (%m)\n", ret); + goto out; + } + + if (cmd_flags & KDBUS_SEND_SYNC_REPLY) { + struct kdbus_msg *reply; + + kdbus_printf("SYNC REPLY @offset %llu:\n", cmd->reply.offset); + reply = (struct kdbus_msg *)(conn->buf + cmd->reply.offset); + kdbus_msg_dump(conn, reply); + + kdbus_msg_free(reply); + + ret = kdbus_free(conn, cmd->reply.offset); + if (ret < 0) + goto out; + } + +out: + free(msg); + free(cmd); + + if (memfd >= 0) + close(memfd); + + return ret < 0 ? 
ret : 0; +} + +int kdbus_msg_send(const struct kdbus_conn *conn, const char *name, + uint64_t cookie, uint64_t flags, uint64_t timeout, + int64_t priority, uint64_t dst_id) +{ + return __kdbus_msg_send(conn, name, cookie, flags, timeout, priority, + dst_id, 0, -1); +} + +int kdbus_msg_send_sync(const struct kdbus_conn *conn, const char *name, + uint64_t cookie, uint64_t flags, uint64_t timeout, + int64_t priority, uint64_t dst_id, int cancel_fd) +{ + return __kdbus_msg_send(conn, name, cookie, flags, timeout, priority, + dst_id, KDBUS_SEND_SYNC_REPLY, cancel_fd); +} + +int kdbus_msg_send_reply(const struct kdbus_conn *conn, + uint64_t reply_cookie, + uint64_t dst_id) +{ + struct kdbus_cmd_send cmd = {}; + struct kdbus_msg *msg; + const char ref1[1024 * 128 + 3] = "0123456789_0"; + struct kdbus_item *item; + uint64_t size; + int ret; + + size = sizeof(struct kdbus_msg); + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_vec)); + + msg = malloc(size); + if (!msg) { + kdbus_printf("unable to malloc()!?\n"); + return -ENOMEM; + } + + memset(msg, 0, size); + msg->size = size; + msg->src_id = conn->id; + msg->dst_id = dst_id; + msg->cookie_reply = reply_cookie; + msg->payload_type = KDBUS_PAYLOAD_DBUS; + + item = msg->items; + + item->type = KDBUS_ITEM_PAYLOAD_VEC; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec); + item->vec.address = (uintptr_t)&ref1; + item->vec.size = sizeof(ref1); + item = KDBUS_ITEM_NEXT(item); + + cmd.size = sizeof(cmd); + cmd.msg_address = (uintptr_t)msg; + + ret = kdbus_cmd_send(conn->fd, &cmd); + if (ret < 0) + kdbus_printf("error sending message: %d (%m)\n", ret); + + free(msg); + + return ret; +} + +static char *msg_id(uint64_t id, char *buf) +{ + if (id == 0) + return "KERNEL"; + if (id == ~0ULL) + return "BROADCAST"; + sprintf(buf, "%llu", (unsigned long long)id); + return buf; +} + +int kdbus_msg_dump(const struct kdbus_conn *conn, const struct kdbus_msg *msg) +{ + const struct kdbus_item *item = msg->items; + char buf_src[32]; + char buf_dst[32]; + uint64_t timeout = 0; + uint64_t cookie_reply = 0; + int ret = 0; + + if (msg->flags & KDBUS_MSG_EXPECT_REPLY) + timeout = msg->timeout_ns; + else + cookie_reply = msg->cookie_reply; + + kdbus_printf("MESSAGE: %s (%llu bytes) flags=0x%08llx, %s → %s, " + "cookie=%llu, timeout=%llu cookie_reply=%llu priority=%lli\n", + enum_PAYLOAD(msg->payload_type), (unsigned long long)msg->size, + (unsigned long long)msg->flags, + msg_id(msg->src_id, buf_src), msg_id(msg->dst_id, buf_dst), + (unsigned long long)msg->cookie, (unsigned long long)timeout, + (unsigned long long)cookie_reply, (long long)msg->priority); + + KDBUS_ITEM_FOREACH(item, msg, items) { + if (item->size < KDBUS_ITEM_HEADER_SIZE) { + kdbus_printf(" +%s (%llu bytes) invalid data record\n", + enum_MSG(item->type), item->size); + ret = -EINVAL; + break; + } + + switch (item->type) { + case KDBUS_ITEM_PAYLOAD_OFF: { + char *s; + + if (item->vec.offset == ~0ULL) + s = "[\\0-bytes]"; + else + s = (char *)msg + item->vec.offset; + + kdbus_printf(" +%s (%llu bytes) off=%llu size=%llu '%s'\n", + enum_MSG(item->type), item->size, + (unsigned long long)item->vec.offset, + (unsigned long long)item->vec.size, s); + break; + } + + case KDBUS_ITEM_FDS: { + int i, n = (item->size - KDBUS_ITEM_HEADER_SIZE) / + sizeof(int); + + kdbus_printf(" +%s (%llu bytes, %d fds)\n", + enum_MSG(item->type), item->size, n); + + for (i = 0; i < n; i++) + kdbus_printf(" fd[%d] = %d\n", + i, item->fds[i]); + + break; + } + + case KDBUS_ITEM_PAYLOAD_MEMFD: { + char *buf; + off_t size; + + 
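+			/*
+			 * The memfd payload is dumped by mapping it
+			 * read-only; the mapping is dropped again via
+			 * munmap() once its contents have been printed.
+			 */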
buf = mmap(NULL, item->memfd.size, PROT_READ,
+				   MAP_PRIVATE, item->memfd.fd, 0);
+			if (buf == MAP_FAILED) {
+				kdbus_printf("mmap() fd=%i size=%llu failed: %m\n",
+					     item->memfd.fd, item->memfd.size);
+				break;
+			}
+
+			if (sys_memfd_get_size(item->memfd.fd, &size) < 0) {
+				kdbus_printf("sys_memfd_get_size() failed: %m\n");
+				munmap(buf, item->memfd.size);
+				break;
+			}
+
+			kdbus_printf(" +%s (%llu bytes) fd=%i size=%llu filesize=%llu '%s'\n",
+				     enum_MSG(item->type), item->size, item->memfd.fd,
+				     (unsigned long long)item->memfd.size,
+				     (unsigned long long)size, buf);
+			munmap(buf, item->memfd.size);
+			break;
+		}
+
+		case KDBUS_ITEM_CREDS:
+			kdbus_printf(" +%s (%llu bytes) uid=%lld, euid=%lld, suid=%lld, fsuid=%lld, "
+				     "gid=%lld, egid=%lld, sgid=%lld, fsgid=%lld\n",
+				     enum_MSG(item->type), item->size,
+				     item->creds.uid, item->creds.euid,
+				     item->creds.suid, item->creds.fsuid,
+				     item->creds.gid, item->creds.egid,
+				     item->creds.sgid, item->creds.fsgid);
+			break;
+
+		case KDBUS_ITEM_PIDS:
+			kdbus_printf(" +%s (%llu bytes) pid=%lld, tid=%lld, ppid=%lld\n",
+				     enum_MSG(item->type), item->size,
+				     item->pids.pid, item->pids.tid,
+				     item->pids.ppid);
+			break;
+
+		case KDBUS_ITEM_AUXGROUPS: {
+			int i, n;
+
+			kdbus_printf(" +%s (%llu bytes)\n",
+				     enum_MSG(item->type), item->size);
+			n = (item->size - KDBUS_ITEM_HEADER_SIZE) /
+			    sizeof(uint64_t);
+
+			for (i = 0; i < n; i++)
+				kdbus_printf("    gid[%d] = %lld\n",
+					     i, item->data64[i]);
+			break;
+		}
+
+		case KDBUS_ITEM_NAME:
+		case KDBUS_ITEM_PID_COMM:
+		case KDBUS_ITEM_TID_COMM:
+		case KDBUS_ITEM_EXE:
+		case KDBUS_ITEM_CGROUP:
+		case KDBUS_ITEM_SECLABEL:
+		case KDBUS_ITEM_DST_NAME:
+		case KDBUS_ITEM_CONN_DESCRIPTION:
+			kdbus_printf(" +%s (%llu bytes) '%s' (%zu)\n",
+				     enum_MSG(item->type), item->size,
+				     item->str, strlen(item->str));
+			break;
+
+		case KDBUS_ITEM_OWNED_NAME: {
+			kdbus_printf(" +%s (%llu bytes) '%s' (%zu) flags=0x%08llx\n",
+				     enum_MSG(item->type), item->size,
+				     item->name.name, strlen(item->name.name),
+				     item->name.flags);
+			break;
+		}
+
+		case KDBUS_ITEM_CMDLINE: {
+			size_t size = item->size - KDBUS_ITEM_HEADER_SIZE;
+			const char *str = item->str;
+			int count = 0;
+
+			kdbus_printf(" +%s (%llu bytes) ",
+				     enum_MSG(item->type), item->size);
+			while (size) {
+				kdbus_printf("'%s' ", str);
+				size -= strlen(str) + 1;
+				str += strlen(str) + 1;
+				count++;
+			}
+
+			kdbus_printf("(%d string%s)\n",
+				     count, (count == 1) ?
"" : "s"); + break; + } + + case KDBUS_ITEM_AUDIT: + kdbus_printf(" +%s (%llu bytes) loginuid=%u sessionid=%u\n", + enum_MSG(item->type), item->size, + item->audit.loginuid, item->audit.sessionid); + break; + + case KDBUS_ITEM_CAPS: { + const uint32_t *cap; + int n, i; + + kdbus_printf(" +%s (%llu bytes) len=%llu bytes, last_cap %d\n", + enum_MSG(item->type), item->size, + (unsigned long long)item->size - + KDBUS_ITEM_HEADER_SIZE, + (int) item->caps.last_cap); + + cap = item->caps.caps; + n = (item->size - offsetof(struct kdbus_item, caps.caps)) + / 4 / sizeof(uint32_t); + + kdbus_printf(" CapInh="); + for (i = 0; i < n; i++) + kdbus_printf("%08x", cap[(0 * n) + (n - i - 1)]); + + kdbus_printf(" CapPrm="); + for (i = 0; i < n; i++) + kdbus_printf("%08x", cap[(1 * n) + (n - i - 1)]); + + kdbus_printf(" CapEff="); + for (i = 0; i < n; i++) + kdbus_printf("%08x", cap[(2 * n) + (n - i - 1)]); + + kdbus_printf(" CapBnd="); + for (i = 0; i < n; i++) + kdbus_printf("%08x", cap[(3 * n) + (n - i - 1)]); + kdbus_printf("\n"); + break; + } + + case KDBUS_ITEM_TIMESTAMP: + kdbus_printf(" +%s (%llu bytes) seq=%llu realtime=%lluns monotonic=%lluns\n", + enum_MSG(item->type), item->size, + (unsigned long long)item->timestamp.seqnum, + (unsigned long long)item->timestamp.realtime_ns, + (unsigned long long)item->timestamp.monotonic_ns); + break; + + case KDBUS_ITEM_REPLY_TIMEOUT: + kdbus_printf(" +%s (%llu bytes) cookie=%llu\n", + enum_MSG(item->type), item->size, + msg->cookie_reply); + break; + + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_REMOVE: + case KDBUS_ITEM_NAME_CHANGE: + kdbus_printf(" +%s (%llu bytes) '%s', old id=%lld, now id=%lld, old_flags=0x%llx new_flags=0x%llx\n", + enum_MSG(item->type), + (unsigned long long) item->size, + item->name_change.name, + item->name_change.old_id.id, + item->name_change.new_id.id, + item->name_change.old_id.flags, + item->name_change.new_id.flags); + break; + + case KDBUS_ITEM_ID_ADD: + case KDBUS_ITEM_ID_REMOVE: + kdbus_printf(" +%s (%llu bytes) id=%llu flags=%llu\n", + enum_MSG(item->type), + (unsigned long long) item->size, + (unsigned long long) item->id_change.id, + (unsigned long long) item->id_change.flags); + break; + + default: + kdbus_printf(" +%s (%llu bytes)\n", + enum_MSG(item->type), item->size); + break; + } + } + + if ((char *)item - ((char *)msg + msg->size) >= 8) { + kdbus_printf("invalid padding at end of message\n"); + ret = -EINVAL; + } + + kdbus_printf("\n"); + + return ret; +} + +void kdbus_msg_free(struct kdbus_msg *msg) +{ + const struct kdbus_item *item; + int nfds, i; + + if (!msg) + return; + + KDBUS_ITEM_FOREACH(item, msg, items) { + switch (item->type) { + /* close all memfds */ + case KDBUS_ITEM_PAYLOAD_MEMFD: + close(item->memfd.fd); + break; + case KDBUS_ITEM_FDS: + nfds = (item->size - KDBUS_ITEM_HEADER_SIZE) / + sizeof(int); + + for (i = 0; i < nfds; i++) + close(item->fds[i]); + + break; + } + } +} + +int kdbus_msg_recv(struct kdbus_conn *conn, + struct kdbus_msg **msg_out, + uint64_t *offset) +{ + struct kdbus_cmd_recv recv = { .size = sizeof(recv) }; + struct kdbus_msg *msg; + int ret; + + ret = kdbus_cmd_recv(conn->fd, &recv); + if (ret < 0) + return ret; + + msg = (struct kdbus_msg *)(conn->buf + recv.msg.offset); + ret = kdbus_msg_dump(conn, msg); + if (ret < 0) { + kdbus_msg_free(msg); + return ret; + } + + if (msg_out) { + *msg_out = msg; + + if (offset) + *offset = recv.msg.offset; + } else { + kdbus_msg_free(msg); + + ret = kdbus_free(conn, recv.msg.offset); + if (ret < 0) + return ret; + } + + return 0; +} + 
+/*
+ * Wait for a message on *conn and receive it.
+ *
+ * Returns: 0 on success, negative errno on failure.
+ *
+ * -ETIMEDOUT, -ECONNRESET, -EAGAIN and other errors are passed through,
+ * as is the result of kdbus_msg_recv(). On success, the caller owns the
+ * received message and must release it with kdbus_msg_free() and
+ * kdbus_free().
+ */
+int kdbus_msg_recv_poll(struct kdbus_conn *conn,
+			int timeout_ms,
+			struct kdbus_msg **msg_out,
+			uint64_t *offset)
+{
+	int ret;
+
+	do {
+		struct timeval before, after, diff;
+		struct pollfd fd;
+
+		fd.fd = conn->fd;
+		fd.events = POLLIN | POLLPRI | POLLHUP;
+		fd.revents = 0;
+
+		gettimeofday(&before, NULL);
+		ret = poll(&fd, 1, timeout_ms);
+		gettimeofday(&after, NULL);
+
+		if (ret == 0) {
+			ret = -ETIMEDOUT;
+			break;
+		}
+
+		if (ret > 0) {
+			if (fd.revents & POLLIN)
+				ret = kdbus_msg_recv(conn, msg_out, offset);
+
+			if (fd.revents & (POLLHUP | POLLERR))
+				ret = -ECONNRESET;
+		}
+
+		if (ret == 0 || ret != -EAGAIN)
+			break;
+
+		timersub(&after, &before, &diff);
+		timeout_ms -= diff.tv_sec * 1000UL +
+			      diff.tv_usec / 1000UL;
+	} while (timeout_ms > 0);
+
+	return ret;
+}
+
+int kdbus_free(const struct kdbus_conn *conn, uint64_t offset)
+{
+	struct kdbus_cmd_free cmd_free = {};
+	int ret;
+
+	cmd_free.size = sizeof(cmd_free);
+	cmd_free.offset = offset;
+	cmd_free.flags = 0;
+
+	ret = kdbus_cmd_free(conn->fd, &cmd_free);
+	if (ret < 0) {
+		kdbus_printf("KDBUS_CMD_FREE failed: %d (%m)\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int kdbus_name_acquire(struct kdbus_conn *conn,
+		       const char *name, uint64_t *flags)
+{
+	struct kdbus_cmd *cmd_name;
+	size_t name_len = strlen(name) + 1;
+	uint64_t size = sizeof(*cmd_name) + KDBUS_ITEM_SIZE(name_len);
+	struct kdbus_item *item;
+	int ret;
+
+	cmd_name = alloca(size);
+
+	memset(cmd_name, 0, size);
+
+	item = cmd_name->items;
+	item->size = KDBUS_ITEM_HEADER_SIZE + name_len;
+	item->type = KDBUS_ITEM_NAME;
+	strcpy(item->str, name);
+
+	cmd_name->size = size;
+	if (flags)
+		cmd_name->flags = *flags;
+
+	ret = kdbus_cmd_name_acquire(conn->fd, cmd_name);
+	if (ret < 0) {
+		kdbus_printf("error acquiring name: %s\n", strerror(-ret));
+		return ret;
+	}
+
+	kdbus_printf("%s(): flags after call: 0x%llx\n", __func__,
+		     cmd_name->return_flags);
+
+	if (flags)
+		*flags = cmd_name->return_flags;
+
+	return 0;
+}
+
+int kdbus_name_release(struct kdbus_conn *conn, const char *name)
+{
+	struct kdbus_cmd *cmd_name;
+	size_t name_len = strlen(name) + 1;
+	uint64_t size = sizeof(*cmd_name) + KDBUS_ITEM_SIZE(name_len);
+	struct kdbus_item *item;
+	int ret;
+
+	cmd_name = alloca(size);
+
+	memset(cmd_name, 0, size);
+
+	item = cmd_name->items;
+	item->size = KDBUS_ITEM_HEADER_SIZE + name_len;
+	item->type = KDBUS_ITEM_NAME;
+	strcpy(item->str, name);
+
+	cmd_name->size = size;
+
+	kdbus_printf("conn %lld giving up name '%s'\n",
+		     (unsigned long long) conn->id, name);
+
+	ret = kdbus_cmd_name_release(conn->fd, cmd_name);
+	if (ret < 0) {
+		kdbus_printf("error releasing name: %s\n", strerror(-ret));
+		return ret;
+	}
+
+	return 0;
+}
+
+int kdbus_list(struct kdbus_conn *conn, uint64_t flags)
+{
+	struct kdbus_cmd_list cmd_list = {};
+	struct kdbus_info *list, *name;
+	int ret;
+
+	cmd_list.size = sizeof(cmd_list);
+	cmd_list.flags = flags;
+
+	ret = kdbus_cmd_list(conn->fd, &cmd_list);
+	if (ret < 0) {
+		kdbus_printf("error listing names: %d (%m)\n", ret);
+		return ret;
+	}
+
+	kdbus_printf("REGISTRY:\n");
+	list = (struct kdbus_info *)(conn->buf + cmd_list.offset);
+
+	KDBUS_FOREACH(name, list, cmd_list.list_size) {
+		uint64_t flags = 0;
+		struct kdbus_item *item;
+		const char *n = "MISSING-NAME";
+
+		if (name->size == sizeof(struct kdbus_cmd))
+			continue;
+
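+		/*
+		 * Each kdbus_info record can carry several items; the
+		 * KDBUS_ITEM_OWNED_NAME items below hold the well-known
+		 * name and its flags.
+		 */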
KDBUS_ITEM_FOREACH(item, name, items) + if (item->type == KDBUS_ITEM_OWNED_NAME) { + n = item->name.name; + flags = item->name.flags; + + kdbus_printf("%8llu flags=0x%08llx conn=0x%08llx '%s'\n", + name->id, + (unsigned long long) flags, + name->flags, n); + } + } + kdbus_printf("\n"); + + ret = kdbus_free(conn, cmd_list.offset); + + return ret; +} + +int kdbus_conn_update_attach_flags(struct kdbus_conn *conn, + uint64_t attach_flags_send, + uint64_t attach_flags_recv) +{ + int ret; + size_t size; + struct kdbus_cmd *update; + struct kdbus_item *item; + + size = sizeof(struct kdbus_cmd); + size += KDBUS_ITEM_SIZE(sizeof(uint64_t)) * 2; + + update = malloc(size); + if (!update) { + kdbus_printf("error malloc: %m\n"); + return -ENOMEM; + } + + memset(update, 0, size); + update->size = size; + + item = update->items; + + item->type = KDBUS_ITEM_ATTACH_FLAGS_SEND; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(uint64_t); + item->data64[0] = attach_flags_send; + item = KDBUS_ITEM_NEXT(item); + + item->type = KDBUS_ITEM_ATTACH_FLAGS_RECV; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(uint64_t); + item->data64[0] = attach_flags_recv; + item = KDBUS_ITEM_NEXT(item); + + ret = kdbus_cmd_update(conn->fd, update); + if (ret < 0) + kdbus_printf("error conn update: %d (%m)\n", ret); + + free(update); + + return ret; +} + +int kdbus_conn_update_policy(struct kdbus_conn *conn, const char *name, + const struct kdbus_policy_access *access, + size_t num_access) +{ + struct kdbus_cmd *update; + struct kdbus_item *item; + size_t i, size; + int ret; + + size = sizeof(struct kdbus_cmd); + size += KDBUS_ITEM_SIZE(strlen(name) + 1); + size += num_access * KDBUS_ITEM_SIZE(sizeof(struct kdbus_policy_access)); + + update = malloc(size); + if (!update) { + kdbus_printf("error malloc: %m\n"); + return -ENOMEM; + } + + memset(update, 0, size); + update->size = size; + + item = update->items; + + item->type = KDBUS_ITEM_NAME; + item->size = KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + strcpy(item->str, name); + item = KDBUS_ITEM_NEXT(item); + + for (i = 0; i < num_access; i++) { + item->size = KDBUS_ITEM_HEADER_SIZE + + sizeof(struct kdbus_policy_access); + item->type = KDBUS_ITEM_POLICY_ACCESS; + + item->policy_access.type = access[i].type; + item->policy_access.access = access[i].access; + item->policy_access.id = access[i].id; + + item = KDBUS_ITEM_NEXT(item); + } + + ret = kdbus_cmd_update(conn->fd, update); + if (ret < 0) + kdbus_printf("error conn update: %d (%m)\n", ret); + + free(update); + + return ret; +} + +int kdbus_add_match_id(struct kdbus_conn *conn, uint64_t cookie, + uint64_t type, uint64_t id) +{ + struct { + struct kdbus_cmd_match cmd; + struct { + uint64_t size; + uint64_t type; + struct kdbus_notify_id_change chg; + } item; + } buf; + int ret; + + memset(&buf, 0, sizeof(buf)); + + buf.cmd.size = sizeof(buf); + buf.cmd.cookie = cookie; + buf.item.size = sizeof(buf.item); + buf.item.type = type; + buf.item.chg.id = id; + + ret = kdbus_cmd_match_add(conn->fd, &buf.cmd); + if (ret < 0) + kdbus_printf("--- error adding conn match: %d (%m)\n", ret); + + return ret; +} + +int kdbus_add_match_empty(struct kdbus_conn *conn) +{ + struct { + struct kdbus_cmd_match cmd; + struct kdbus_item item; + } buf; + int ret; + + memset(&buf, 0, sizeof(buf)); + + buf.item.size = sizeof(uint64_t) * 3; + buf.item.type = KDBUS_ITEM_ID; + buf.item.id = KDBUS_MATCH_ID_ANY; + + buf.cmd.size = sizeof(buf.cmd) + buf.item.size; + + ret = kdbus_cmd_match_add(conn->fd, &buf.cmd); + if (ret < 0) + kdbus_printf("--- error adding conn 
match: %d (%m)\n", ret);
+
+	return ret;
+}
+
+static int all_ids_are_mapped(const char *path)
+{
+	int ret;
+	FILE *file;
+	uint32_t inside_id, length;
+
+	file = fopen(path, "r");
+	if (!file) {
+		ret = -errno;
+		kdbus_printf("error fopen() %s: %d (%m)\n",
+			     path, ret);
+		return ret;
+	}
+
+	ret = fscanf(file, "%u\t%*u\t%u", &inside_id, &length);
+	if (ret != 2) {
+		if (ferror(file))
+			ret = -errno;
+		else
+			ret = -EIO;
+
+		kdbus_printf("--- error fscanf(): %d\n", ret);
+		fclose(file);
+		return ret;
+	}
+
+	fclose(file);
+
+	/*
+	 * If the length is 4294967295, i.e. the invalid uid (uid_t)-1,
+	 * then we are able to map all uids/gids.
+	 */
+	if (inside_id == 0 && length == (uid_t) -1)
+		return 1;
+
+	return 0;
+}
+
+int all_uids_gids_are_mapped(void)
+{
+	int ret;
+
+	ret = all_ids_are_mapped("/proc/self/uid_map");
+	if (ret <= 0) {
+		kdbus_printf("--- error not all uids are mapped\n");
+		return 0;
+	}
+
+	ret = all_ids_are_mapped("/proc/self/gid_map");
+	if (ret <= 0) {
+		kdbus_printf("--- error not all gids are mapped\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+int drop_privileges(uid_t uid, gid_t gid)
+{
+	int ret;
+
+	ret = setgroups(0, NULL);
+	if (ret < 0) {
+		ret = -errno;
+		kdbus_printf("error setgroups: %d (%m)\n", ret);
+		return ret;
+	}
+
+	ret = setresgid(gid, gid, gid);
+	if (ret < 0) {
+		ret = -errno;
+		kdbus_printf("error setresgid: %d (%m)\n", ret);
+		return ret;
+	}
+
+	ret = setresuid(uid, uid, uid);
+	if (ret < 0) {
+		ret = -errno;
+		kdbus_printf("error setresuid: %d (%m)\n", ret);
+		return ret;
+	}
+
+	return ret;
+}
+
+uint64_t now(clockid_t clock)
+{
+	struct timespec spec;
+
+	clock_gettime(clock, &spec);
+	return spec.tv_sec * 1000ULL * 1000ULL * 1000ULL + spec.tv_nsec;
+}
+
+char *unique_name(const char *prefix)
+{
+	unsigned int i;
+	uint64_t u_now;
+	char n[17];
+	char *str;
+	int r;
+
+	/*
+	 * This returns a random string which is guaranteed to be
+	 * globally unique across all calls to unique_name(). We
+	 * compose the string as:
+	 *   <prefix>-<random>-<time>