diff -Nur linux-4.2/Documentation/Makefile linux-4.2-pck/Documentation/Makefile --- linux-4.2/Documentation/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/Documentation/Makefile 2015-09-08 00:48:22.208364829 -0300 @@ -1,4 +1,4 @@ subdir-y := accounting auxdisplay blackfin connector \ - filesystems filesystems ia64 laptops mic misc-devices \ + filesystems filesystems ia64 kdbus laptops mic misc-devices \ networking pcmcia prctl ptp spi timers vDSO video4linux \ watchdog diff -Nur linux-4.2/Documentation/ioctl/ioctl-number.txt linux-4.2-pck/Documentation/ioctl/ioctl-number.txt --- linux-4.2/Documentation/ioctl/ioctl-number.txt 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/Documentation/ioctl/ioctl-number.txt 2015-09-08 00:48:22.208364829 -0300 @@ -292,6 +292,7 @@ 0x92 00-0F drivers/usb/mon/mon_bin.c 0x93 60-7F linux/auto_fs.h 0x94 all fs/btrfs/ioctl.h +0x95 all uapi/linux/kdbus.h kdbus IPC driver 0x97 00-7F fs/ceph/ioctl.h Ceph file system 0x99 00-0F 537-Addinboard driver diff -Nur linux-4.2/Documentation/kdbus/.gitignore linux-4.2-pck/Documentation/kdbus/.gitignore --- linux-4.2/Documentation/kdbus/.gitignore 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/.gitignore 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,2 @@ +*.7 +*.html diff -Nur linux-4.2/Documentation/kdbus/Makefile linux-4.2-pck/Documentation/kdbus/Makefile --- linux-4.2/Documentation/kdbus/Makefile 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/Makefile 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,44 @@ +DOCS := \ + kdbus.xml \ + kdbus.bus.xml \ + kdbus.connection.xml \ + kdbus.endpoint.xml \ + kdbus.fs.xml \ + kdbus.item.xml \ + kdbus.match.xml \ + kdbus.message.xml \ + kdbus.name.xml \ + kdbus.policy.xml \ + kdbus.pool.xml + +XMLFILES := $(addprefix $(obj)/,$(DOCS)) +MANFILES := $(patsubst %.xml, %.7, $(XMLFILES)) +HTMLFILES := $(patsubst %.xml, %.html, $(XMLFILES)) + +XMLTO_ARGS := -m $(srctree)/$(src)/stylesheet.xsl --skip-validation + +quiet_cmd_db2man = MAN $@ + cmd_db2man = xmlto man $(XMLTO_ARGS) -o $(obj) $< +%.7: %.xml + @(which xmlto > /dev/null 2>&1) || \ + (echo "*** You need to install xmlto ***"; \ + exit 1) + $(call cmd,db2man) + +quiet_cmd_db2html = HTML $@ + cmd_db2html = xmlto html-nochunks $(XMLTO_ARGS) -o $(obj) $< +%.html: %.xml + @(which xmlto > /dev/null 2>&1) || \ + (echo "*** You need to install xmlto ***"; \ + exit 1) + $(call cmd,db2html) + +mandocs: $(MANFILES) + +htmldocs: $(HTMLFILES) + +clean-files := $(MANFILES) $(HTMLFILES) + +# we don't support other %docs targets right now +%docs: + @true diff -Nur linux-4.2/Documentation/kdbus/kdbus.bus.xml linux-4.2-pck/Documentation/kdbus/kdbus.bus.xml --- linux-4.2/Documentation/kdbus/kdbus.bus.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.bus.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,344 @@ + + + + + + + kdbus.bus + kdbus.bus + + + + kdbus.bus + 7 + + + + kdbus.bus + kdbus bus + + + + Description + + + A bus is a resource that is shared between connections in order to + transmit messages (see + + kdbus.message + 7 + ). + Each bus is independent, and operations on the bus will not have any + effect on other buses. A bus is a management entity that controls the + addresses of its connections, their policies and message transactions + performed via this bus. + + + Each bus is bound to the mount instance it was created on. It has a + custom name that is unique across all buses of a domain. 
In + + kdbus.fs + 7 + + a bus is presented as a directory. No operations can be performed on + the bus itself; instead you need to perform the operations on an endpoint + associated with the bus. Endpoints are accessible as files underneath the + bus directory. A default endpoint called bus is + provided on each bus. + + + Bus names may be chosen freely except for one restriction: the name must + be prefixed with the numeric effective UID of the creator and a dash. This + is required to avoid namespace clashes between different users. When + creating a bus, the name that is passed in must be properly formatted, or + the kernel will refuse creation of the bus. Example: + 1047-foobar is an acceptable name for a bus + registered by a user with UID 1047. However, + 1024-foobar is not, and neither is + foobar. The UID must be provided in the + user-namespace of the bus owner. + + + To create a new bus, you need to open the control file of a domain and + employ the KDBUS_CMD_BUS_MAKE ioctl. The control + file descriptor that was used to issue + KDBUS_CMD_BUS_MAKE must not previously have been + used for any other control-ioctl and must be kept open for the entire + life-time of the created bus. Closing it will immediately cleanup the + entire bus and all its associated resources and endpoints. Every control + file descriptor can only be used to create a single new bus; from that + point on, it is not used for any further communication until the final + + close + 2 + + . + + + Each bus will generate a random, 128-bit UUID upon creation. This UUID + will be returned to creators of connections through + kdbus_cmd_hello.id128 and can be used to uniquely + identify buses, even across different machines or containers. The UUID + will have its variant bits set to DCE, and denote + version 4 (random). For more details on UUIDs, see + the Wikipedia article on UUIDs. + + + + + + Creating buses + + To create a new bus, the KDBUS_CMD_BUS_MAKE + command is used. It takes a struct kdbus_cmd argument. + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + The flags for creation. + + + KDBUS_MAKE_ACCESS_GROUP + + Make the bus file group-accessible. + + + + + KDBUS_MAKE_ACCESS_WORLD + + Make the bus file world-accessible. + + + + + KDBUS_FLAG_NEGOTIATE + + + Requests a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will return + 0, and the flags + field will have all bits set that are valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + The following items (see + + kdbus.item + 7 + ) + are expected for KDBUS_CMD_BUS_MAKE. + + + + KDBUS_ITEM_MAKE_NAME + + + Contains a null-terminated string that identifies the + bus. The name must be unique across the kdbus domain and + must start with the effective UID of the caller, followed by + a '-' (dash). This item is mandatory. + + + + + + KDBUS_ITEM_BLOOM_PARAMETER + + + Bus-wide bloom parameters passed in a + struct kdbus_bloom_parameter. These settings are + copied back to new connections verbatim. This item is + mandatory. See + + kdbus.item + 7 + + for a more detailed description of this item. 
+ + + + + + KDBUS_ITEM_ATTACH_FLAGS_SEND + + + An optional item that contains a set of attach flags that are + returned to connections when they query the bus creator + metadata. If not set, no metadata is returned. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_BUS_MAKE</constant> may fail with the following + errors + + + + + EBADMSG + + A mandatory item is missing. + + + + + EINVAL + + The flags supplied in the struct kdbus_cmd + are invalid or the supplied name does not start with the current + UID and a '-' (dash). + + + + + EEXIST + + A bus of that name already exists. + + + + + ESHUTDOWN + + The kdbus mount instance for the bus was already shut down. + + + + + EMFILE + + The maximum number of buses for the current user is exhausted. + + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.connection + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.item + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.connection.xml linux-4.2-pck/Documentation/kdbus/kdbus.connection.xml --- linux-4.2/Documentation/kdbus/kdbus.connection.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.connection.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,1244 @@ + + + + + + + kdbus.connection + kdbus.connection + + + + kdbus.connection + 7 + + + + kdbus.connection + kdbus connection + + + + Description + + + Connections are identified by their connection ID, + internally implemented as a uint64_t counter. + The IDs of every newly created bus start at 1, and + every new connection will increment the counter by 1. + The IDs are not reused. + + + In higher level tools, the user visible representation of a connection is + defined by the D-Bus protocol specification as + ":1.<ID>". + + + Messages with a specific uint64_t destination ID are + directly delivered to the connection with the corresponding ID. Signal + messages (see + + kdbus.message + 7 + ) + may be addressed to the special destination ID + KDBUS_DST_ID_BROADCAST (~0ULL) and will then + potentially be delivered to all currently active connections on the bus. + However, in order to receive any signal messages, clients must subscribe + to them by installing a match (see + + kdbus.match + 7 + ). + + + Messages synthesized and sent directly by the kernel will carry the + special source ID KDBUS_SRC_ID_KERNEL (0). + + + In addition to the unique uint64_t connection ID, + established connections can request the ownership of + well-known names, under which they can be found and + addressed by other bus clients. A well-known name is associated with one + and only one connection at a time. See + + kdbus.name + 7 + + on name acquisition, the name registry, and the validity of names. + + + Messages can specify the special destination ID + KDBUS_DST_ID_NAME (0) and carry a well-known name + in the message data. Such a message is delivered to the destination + connection which owns that well-known name. 
+ + + | src: 22 | | | + | | | | dst: 25 | | | + | | | | | | | + | | | | | | | + | | | +---------------------------+ | | + | | | | | + | | | <--------------------------------------+ | | + | +---------------+ | | | + | | | | + | +---------------+ +---------------------------+ | | | + | | Connection | | Message | -----+ | | + | | :1.25 | --> | src: 25 | | | + | | | | dst: 0xffffffffffffffff | -------------+ | | + | | | | (KDBUS_DST_ID_BROADCAST) | | | | + | | | | | ---------+ | | | + | | | +---------------------------+ | | | | + | | | | | | | + | | | <--------------------------------------------------+ | + | +---------------+ | | | + | | | | + | +---------------+ +---------------------------+ | | | + | | Connection | | Message | --+ | | | + | | :1.55 | --> | src: 55 | | | | | + | | | | dst: 0 / org.foo.bar | | | | | + | | | | | | | | | + | | | | | | | | | + | | | +---------------------------+ | | | | + | | | | | | | + | | | <------------------------------------------+ | | + | +---------------+ | | | + | | | | + | +---------------+ | | | + | | Connection | | | | + | | :1.81 | | | | + | | org.foo.bar | | | | + | | | | | | + | | | | | | + | | | <-----------------------------------+ | | + | | | | | + | | | <----------------------------------------------+ | + | +---------------+ | + +-------------------------------------------------------------------------+ + ]]> + + + + Privileged connections + + A connection is considered privileged if the user + it was created by is the same that created the bus, or if the creating + task had CAP_IPC_OWNER set when it called + KDBUS_CMD_HELLO (see below). + + + Privileged connections have permission to employ certain restricted + functions and commands, which are explained below and in other kdbus + man-pages. + + + + + Activator and policy holder connection + + An activator connection is a placeholder for a + well-known name. Messages sent to such a connection + can be used to start an implementer connection, which will then get all + the messages from the activator copied over. An activator connection + cannot be used to send any message. + + + A policy holder connection only installs a policy + for one or more names. These policy entries are kept active as long as + the connection is alive, and are removed once it terminates. Such a + policy connection type can be used to deploy restrictions for names that + are not yet active on the bus. A policy holder connection cannot be used + to send any message. + + + The creation of activator or policy holder connections is restricted to + privileged users on the bus (see above). + + + + + Monitor connections + + Monitors are eavesdropping connections that receive all the traffic on the + bus, but is invisible to other connections. Such connections have all + properties of any other, regular connection, except for the following + details: + + + + + They will get every message sent over the bus, both unicasts and + broadcasts. + + + + Installing matches for signal messages is neither necessary + nor allowed. + + + + They cannot send messages or be directly addressed as receiver. + + + + They cannot own well-known names. Therefore, they also can't operate as + activators. + + + + Their creation and destruction will not cause + KDBUS_ITEM_ID_{ADD,REMOVE} (see + + kdbus.item + 7 + ). + + + + They are not listed with their unique name in name registry dumps + (see KDBUS_CMD_NAME_LIST in + + kdbus.name + 7 + ), so other connections cannot detect the presence of + a monitor. 
+ + + + The creation of monitor connections is restricted to privileged users on + the bus (see above). + + + + + Creating connections + + A connection to a bus is created by opening an endpoint file (see + + kdbus.endpoint + 7 + ) + of a bus and becoming an active client with the + KDBUS_CMD_HELLO ioctl. Every connection has a unique + identifier on the bus and can address messages to every other connection + on the same bus by using the peer's connection ID as the destination. + + + The KDBUS_CMD_HELLO ioctl takes a struct + kdbus_cmd_hello as argument. + + + +struct kdbus_cmd_hello { + __u64 size; + __u64 flags; + __u64 return_flags; + __u64 attach_flags_send; + __u64 attach_flags_recv; + __u64 bus_flags; + __u64 id; + __u64 pool_size; + __u64 offset; + __u8 id128[16]; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Flags to apply to this connection + + + KDBUS_HELLO_ACCEPT_FD + + + When this flag is set, the connection can be sent file + descriptors as message payload of unicast messages. If it's + not set, an attempt to send file descriptors will result in + -ECOMM on the sender's side. + + + + + + KDBUS_HELLO_ACTIVATOR + + + Make this connection an activator (see above). With this bit + set, an item of type KDBUS_ITEM_NAME has + to be attached. This item describes the well-known name this + connection should be an activator for. + A connection can not be an activator and a policy holder at + the same time time, so this bit is not allowed together with + KDBUS_HELLO_POLICY_HOLDER. + + + + + + KDBUS_HELLO_POLICY_HOLDER + + + Make this connection a policy holder (see above). With this + bit set, an item of type KDBUS_ITEM_NAME + has to be attached. This item describes the well-known name + this connection should hold a policy for. + A connection can not be an activator and a policy holder at + the same time time, so this bit is not allowed together with + KDBUS_HELLO_ACTIVATOR. + + + + + + KDBUS_HELLO_MONITOR + + + Make this connection a monitor connection (see above). + + + This flag can only be set by privileged bus connections. See + below for more information. + A connection can not be monitor and an activator or a policy + holder at the same time time, so this bit is not allowed + together with KDBUS_HELLO_ACTIVATOR or + KDBUS_HELLO_POLICY_HOLDER. + + + + + + KDBUS_FLAG_NEGOTIATE + + + Requests a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will return + 0, and the flags + field will have all bits set that are valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + attach_flags_send + + Set the bits for metadata this connection permits to be sent to the + receiving peer. Only metadata items that are both allowed to be sent + by the sender and that are requested by the receiver will be attached + to the message. + + + + + attach_flags_recv + + Request the attachment of metadata for each message received by this + connection. See + + kdbus + 7 + + for information about metadata, and + + kdbus.item + 7 + + regarding items in general. + + + + + bus_flags + + Upon successful completion of the ioctl, this member will contain the + flags of the bus it connected to. 
+ + + + + id + + Upon successful completion of the command, this member will contain + the numerical ID of the new connection. + + + + + pool_size + + The size of the communication pool, in bytes. The pool can be + accessed by calling + + mmap + 2 + + on the file descriptor that was used to issue the + KDBUS_CMD_HELLO ioctl. + The pool size of a connection must be greater than + 0 and a multiple of + PAGE_SIZE. See + + kdbus.pool + 7 + + for more information. + + + + + offset + + The kernel will return the offset in the pool where returned details + will be stored. See below. + + + + + id128 + + Upon successful completion of the ioctl, this member will contain the + 128-bit UUID of the connected bus. + + + + + items + + + Variable list of items containing optional additional information. + The following items are currently expected/valid: + + + + KDBUS_ITEM_CONN_DESCRIPTION + + + Contains a string that describes this connection, so it can + be identified later. + + + + + + KDBUS_ITEM_NAME + KDBUS_ITEM_POLICY_ACCESS + + + For activators and policy holders only, combinations of + these two items describe policy access entries. See + + kdbus.policy + 7 + + for further details. + + + + + + KDBUS_ITEM_CREDS + KDBUS_ITEM_PIDS + KDBUS_ITEM_SECLABEL + + + Privileged bus users may submit these types in order to + create connections with faked credentials. This information + will be returned when peer information is queried by + KDBUS_CMD_CONN_INFO. See below for more + information on retrieving information on connections. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + At the offset returned in the offset field of + struct kdbus_cmd_hello, the kernel will store items + of the following types: + + + + + KDBUS_ITEM_BLOOM_PARAMETER + + + Bloom filter parameter as defined by the bus creator. + + + + + + + The offset in the pool has to be freed with the + KDBUS_CMD_FREE ioctl. See + + kdbus.pool + 7 + + for further information. + + + + + Retrieving information on a connection + + The KDBUS_CMD_CONN_INFO ioctl can be used to + retrieve credentials and properties of the initial creator of a + connection. This ioctl uses the following struct. + + + +struct kdbus_cmd_info { + __u64 size; + __u64 flags; + __u64 return_flags; + __u64 id; + __u64 attach_flags; + __u64 offset; + __u64 info_size; + struct kdbus_item items[0]; +}; + + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Currently, no flags are supported. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. If set, the ioctl will return 0, + and the flags field is set to + 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + id + + The numerical ID of the connection for which information is to be + retrieved. If set to a non-zero value, the + KDBUS_ITEM_OWNED_NAME item is ignored. + + + + + attach_flags + + Specifies which metadata items should be attached to the answer. See + + kdbus.message + 7 + . + + + + + offset + + When the ioctl returns, this field will contain the offset of the + connection information inside the caller's pool. See + + kdbus.pool + 7 + + for further information. 
+ + + + + info_size + + The kernel will return the size of the returned information, so + applications can optionally + + mmap + 2 + + specific parts of the pool. See + + kdbus.pool + 7 + + for further information. + + + + + items + + + The following items are expected for + KDBUS_CMD_CONN_INFO. + + + + KDBUS_ITEM_OWNED_NAME + + + Contains the well-known name of the connection to look up as. + This item is mandatory if the id field is + set to 0. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + When the ioctl returns, the following struct will be stored in the + caller's pool at offset. The fields in this struct + are described below. + + + +struct kdbus_info { + __u64 size; + __u64 id; + __u64 flags; + struct kdbus_item items[0]; +}; + + + + + size + + The overall size of the struct, including its items. + + + + + id + + The connection's unique ID. + + + + + flags + + The connection's flags as specified when it was created. + + + + + items + + + Depending on the flags field in + struct kdbus_cmd_info, items of types + KDBUS_ITEM_OWNED_NAME and + KDBUS_ITEM_CONN_DESCRIPTION may follow here. + KDBUS_ITEM_NEGOTIATE is also allowed. + + + + + + + Once the caller is finished with parsing the return buffer, it needs to + employ the KDBUS_CMD_FREE command for the offset, in + order to free the buffer part. See + + kdbus.pool + 7 + + for further information. + + + + + Getting information about a connection's bus creator + + The KDBUS_CMD_BUS_CREATOR_INFO ioctl takes the same + struct as KDBUS_CMD_CONN_INFO, but is used to + retrieve information about the creator of the bus the connection is + attached to. The metadata returned by this call is collected during the + creation of the bus and is never altered afterwards, so it provides + pristine information on the task that created the bus, at the moment when + it did so. + + + In response to this call, a slice in the connection's pool is allocated + and filled with an object of type struct kdbus_info, + pointed to by the ioctl's offset field. + + + +struct kdbus_info { + __u64 size; + __u64 id; + __u64 flags; + struct kdbus_item items[0]; +}; + + + + + size + + The overall size of the struct, including its items. + + + + + id + + The bus ID. + + + + + flags + + The bus flags as specified when it was created. + + + + + items + + + Metadata information is stored in items here. The item list + contains a KDBUS_ITEM_MAKE_NAME item that + indicates the bus name of the calling connection. + KDBUS_ITEM_NEGOTIATE is allowed to probe + for known item types. + + + + + + + Once the caller is finished with parsing the return buffer, it needs to + employ the KDBUS_CMD_FREE command for the offset, in + order to free the buffer part. See + + kdbus.pool + 7 + + for further information. + + + + + Updating connection details + + Some of a connection's details can be updated with the + KDBUS_CMD_CONN_UPDATE ioctl, using the file + descriptor that was used to create the connection. The update command + uses the following struct. + + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Currently, no flags are supported. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. 
If set, the ioctl will return 0, + and the flags field is set to + 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + Items to describe the connection details to be updated. The + following item types are supported. + + + + KDBUS_ITEM_ATTACH_FLAGS_SEND + + + Supply a new set of metadata items that this connection + permits to be sent along with messages. + + + + + + KDBUS_ITEM_ATTACH_FLAGS_RECV + + + Supply a new set of metadata items that this connection + requests to be attached to each message. + + + + + + KDBUS_ITEM_NAME + KDBUS_ITEM_POLICY_ACCESS + + + Policy holder connections may supply a new set of policy + information with these items. For other connection types, + EOPNOTSUPP is returned in + errno. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + + Termination of connections + + A connection can be terminated by simply calling + + close + 2 + + on its file descriptor. All pending incoming messages will be discarded, + and the memory allocated by the pool will be freed. + + + + An alternative way of closing down a connection is via the + KDBUS_CMD_BYEBYE ioctl. This ioctl will succeed only + if the message queue of the connection is empty at the time of closing; + otherwise, the ioctl will fail with errno set to + EBUSY. When this ioctl returns + successfully, the connection has been terminated and won't accept any new + messages from remote peers. This way, a connection can be terminated + race-free, without losing any messages. The ioctl takes an argument of + type struct kdbus_cmd. + + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Currently, no flags are supported. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. If set, the ioctl will fail with + errno set to EPROTO, and + the flags field is set to 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + The following item types are supported. + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_HELLO</constant> may fail with the following + errors + + + + + EFAULT + + The supplied pool size was 0 or not a multiple of the page size. + + + + + EINVAL + + The flags supplied in struct kdbus_cmd_hello + are invalid. + + + + + EINVAL + + An illegal combination of + KDBUS_HELLO_MONITOR, + KDBUS_HELLO_ACTIVATOR and + KDBUS_HELLO_POLICY_HOLDER was passed in + flags. + + + + + EINVAL + + An invalid set of items was supplied. + + + + + ECONNREFUSED + + The attach_flags_send field did not satisfy the requirements of + the bus. + + + + + EPERM + + A KDBUS_ITEM_CREDS items was supplied, but the + current user is not privileged. 
+ + + + + ESHUTDOWN + + The bus you were trying to connect to has already been shut down. + + + + + EMFILE + + The maximum number of connections on the bus has been reached. + + + + + EOPNOTSUPP + + The endpoint does not support the connection flags supplied in + struct kdbus_cmd_hello. + + + + + + + + <constant>KDBUS_CMD_BYEBYE</constant> may fail with the following + errors + + + + + EALREADY + + The connection has already been shut down. + + + + + EBUSY + + There are still messages queued up in the connection's pool. + + + + + + + + <constant>KDBUS_CMD_CONN_INFO</constant> may fail with the following + errors + + + + + EINVAL + + Invalid flags, or neither an ID nor a name was provided, or the + name is invalid. + + + + + ESRCH + + Connection lookup by name failed. + + + + + ENXIO + + No connection with the provided connection ID found. + + + + + + + + <constant>KDBUS_CMD_CONN_UPDATE</constant> may fail with the following + errors + + + + + EINVAL + + Illegal flags or items. + + + + + EINVAL + + Wildcards submitted in policy entries, or illegal sequence + of policy items. + + + + + EOPNOTSUPP + + Operation not supported by connection. + + + + + E2BIG + + Too many policy items attached. + + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.policy + 7 + + + + + kdbus.pool + 7 + + + + + kdbus.item + 7 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.endpoint.xml linux-4.2-pck/Documentation/kdbus/kdbus.endpoint.xml --- linux-4.2/Documentation/kdbus/kdbus.endpoint.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.endpoint.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,429 @@ + + + + + + + kdbus.endpoint + kdbus.endpoint + + + + kdbus.endpoint + 7 + + + + kdbus.endpoint + kdbus endpoint + + + + Description + + + Endpoints are entry points to a bus (see + + kdbus.bus + 7 + ). + By default, each bus has a default + endpoint called 'bus'. The bus owner has the ability to create custom + endpoints with specific names, permissions, and policy databases + (see below). An endpoint is presented as file underneath the directory + of the parent bus. + + + To create a custom endpoint, open the default endpoint + (bus) and use the + KDBUS_CMD_ENDPOINT_MAKE ioctl with + struct kdbus_cmd. Custom endpoints always have a policy + database that, by default, forbids any operation. You have to explicitly + install policy entries to allow any operation on this endpoint. + + + Once KDBUS_CMD_ENDPOINT_MAKE succeeded, the new + endpoint will appear in the filesystem + ( + kdbus.bus + 7 + ), and the used file descriptor will manage the + newly created endpoint resource. It cannot be used to manage further + resources and must be kept open as long as the endpoint is needed. The + endpoint will be terminated as soon as the file descriptor is closed. + + + Endpoint names may be chosen freely except for one restriction: the name + must be prefixed with the numeric effective UID of the creator and a dash. + This is required to avoid namespace clashes between different users. When + creating an endpoint, the name that is passed in must be properly + formatted or the kernel will refuse creation of the endpoint. Example: + 1047-my-endpoint is an acceptable name for an + endpoint registered by a user with UID 1047. However, + 1024-my-endpoint is not, and neither is + my-endpoint. The UID must be provided in the + user-namespace of the bus. 
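
As a small illustration (not part of the patch), a conforming endpoint or bus name can be composed from the caller's numeric effective UID as sketched below. The helper name is made up, and the sketch assumes the caller runs in the same user namespace as the bus, so that geteuid() yields the UID the kernel expects.

#include <stdio.h>
#include <unistd.h>

/* Compose "<euid>-<suffix>", e.g. "1047-my-endpoint" for a caller with UID 1047. */
static int format_kdbus_name(char *buf, size_t len, const char *suffix)
{
        int n = snprintf(buf, len, "%u-%s", (unsigned int)geteuid(), suffix);

        return (n < 0 || (size_t)n >= len) ? -1 : 0;
}
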
+ + + To create connections to a bus, use KDBUS_CMD_HELLO + on a file descriptor returned by open() on an + endpoint node. See + + kdbus.connection + 7 + + for further details. + + + + + Creating custom endpoints + + To create a new endpoint, the + KDBUS_CMD_ENDPOINT_MAKE command is used. Along with + the endpoint's name, which will be used to expose the endpoint in the + + kdbus.fs + 7 + , + the command also optionally takes items to set up the endpoint's + + kdbus.policy + 7 + . + KDBUS_CMD_ENDPOINT_MAKE takes a + struct kdbus_cmd argument. + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + The flags for creation. + + + KDBUS_MAKE_ACCESS_GROUP + + Make the endpoint file group-accessible. + + + + + KDBUS_MAKE_ACCESS_WORLD + + Make the endpoint file world-accessible. + + + + + KDBUS_FLAG_NEGOTIATE + + + Requests a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will return + 0, and the flags + field will have all bits set that are valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + The following items are expected for + KDBUS_CMD_ENDPOINT_MAKE. + + + + KDBUS_ITEM_MAKE_NAME + + Contains a string to identify the endpoint name. + + + + + KDBUS_ITEM_NAME + KDBUS_ITEM_POLICY_ACCESS + + + These items are used to set the policy attached to the + endpoint. For more details on bus and endpoint policies, see + + kdbus.policy + 7 + . + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + + Updating endpoints + + To update an existing endpoint, the + KDBUS_CMD_ENDPOINT_UPDATE command is used on the file + descriptor that was used to create the endpoint, using + KDBUS_CMD_ENDPOINT_MAKE. The only relevant detail of + the endpoint that can be updated is the policy. When the command is + employed, the policy of the endpoint is replaced + atomically with the new set of rules. + The command takes a struct kdbus_cmd argument. + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Unused for this command. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. If set, the ioctl will return 0, + and the flags field is set to + 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + The following items are expected for + KDBUS_CMD_ENDPOINT_UPDATE. + + + + KDBUS_ITEM_NAME + KDBUS_ITEM_POLICY_ACCESS + + + These items are used to set the policy attached to the + endpoint. For more details on bus and endpoint policies, see + + kdbus.policy + 7 + . + Existing policy is atomically replaced with the new rules + provided. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. 
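
To illustrate the item layout used by both KDBUS_CMD_ENDPOINT_MAKE and KDBUS_CMD_ENDPOINT_UPDATE, here is a minimal userspace sketch (not part of the patch) that creates a custom endpoint. It assumes the kdbus UAPI header added by this patch is installed as <linux/kdbus.h>; the ALIGN8 macro and the helper name are local conveniences, and error handling is kept to a minimum.

#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kdbus.h>

#define ALIGN8(x) (((x) + 7) & ~7ULL)

/*
 * bus_fd: open file descriptor of the bus's default endpoint ("bus" file).
 * name:   endpoint name following the "<euid>-..." convention shown above.
 *
 * On success, bus_fd manages the new endpoint and has to stay open for as
 * long as the endpoint should exist.
 */
static int make_custom_endpoint(int bus_fd, const char *name)
{
        size_t name_len = strlen(name) + 1;
        size_t item_size = offsetof(struct kdbus_item, str) + name_len;
        size_t cmd_size = sizeof(struct kdbus_cmd) + ALIGN8(item_size);
        struct kdbus_cmd *cmd;
        struct kdbus_item *item;
        int ret;

        cmd = calloc(1, cmd_size);
        if (!cmd)
                return -1;

        cmd->size = cmd_size;
        cmd->flags = KDBUS_MAKE_ACCESS_WORLD;   /* make the endpoint file world-accessible */

        item = cmd->items;
        item->size = item_size;                 /* unaligned size is fine for the last item */
        item->type = KDBUS_ITEM_MAKE_NAME;
        memcpy(item->str, name, name_len);

        /*
         * A real setup would append KDBUS_ITEM_NAME and
         * KDBUS_ITEM_POLICY_ACCESS items here, because a custom endpoint's
         * policy database forbids everything by default.
         */

        ret = ioctl(bus_fd, KDBUS_CMD_ENDPOINT_MAKE, cmd);
        free(cmd);
        return ret;
}

Remember that creating custom endpoints is restricted to privileged users, so the call may fail with EPERM for ordinary callers.
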
+ + + + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_ENDPOINT_MAKE</constant> may fail with the + following errors + + + + + EINVAL + + The flags supplied in the struct kdbus_cmd + are invalid. + + + + + EINVAL + + Illegal combination of KDBUS_ITEM_NAME and + KDBUS_ITEM_POLICY_ACCESS was provided. + + + + + EEXIST + + An endpoint of that name already exists. + + + + + EPERM + + The calling user is not privileged. See + + kdbus + 7 + + for information about privileged users. + + + + + + + + <constant>KDBUS_CMD_ENDPOINT_UPDATE</constant> may fail with the + following errors + + + + + EINVAL + + The flags supplied in struct kdbus_cmd + are invalid. + + + + + EINVAL + + Illegal combination of KDBUS_ITEM_NAME and + KDBUS_ITEM_POLICY_ACCESS was provided. + + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.item + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.fs.xml linux-4.2-pck/Documentation/kdbus/kdbus.fs.xml --- linux-4.2/Documentation/kdbus/kdbus.fs.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.fs.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,124 @@ + + + + + + + kdbus.fs + kdbus.fs + + + + kdbus.fs + 7 + + + + kdbus.fs + kdbus file system + + + + File-system Layout + + + The kdbusfs pseudo filesystem provides access to + kdbus entities, such as buses and + endpoints. Each time the filesystem is mounted, + a new, isolated kdbus instance is created, which is independent from the + other instances. + + + The system-wide standard mount point for kdbusfs is + /sys/fs/kdbus. + + + + Buses are represented as directories in the file system layout, whereas + endpoints are exposed as files inside these directories. At the top-level, + a control node is present, which can be opened to + create new buses via the KDBUS_CMD_BUS_MAKE ioctl. + Each bus shows a default endpoint called + bus, which can be opened to either create a connection + with the KDBUS_CMD_HELLO ioctl, or to create new + custom endpoints for the bus with + KDBUS_CMD_ENDPOINT_MAKE. See + + kdbus.bus + 7 + , + + kdbus.connection + 7 + and + + kdbus.endpoint + 7 + + for more details. 
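
As a rough illustration of the control node described above (not part of the patch), the following sketch creates a new bus via KDBUS_CMD_BUS_MAKE. It assumes the kdbus UAPI header from this patch is available as <linux/kdbus.h>; the bus-name suffix and the bloom parameter values are arbitrary examples, not values mandated by the documentation.

#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kdbus.h>

#define ALIGN8(x) (((x) + 7) & ~7ULL)

int main(void)
{
        char name[64];
        size_t name_item, bloom_item, cmd_size;
        struct kdbus_cmd *cmd;
        struct kdbus_item *item;
        int fd;

        /* The control node of the standard kdbusfs mount point. */
        fd = open("/sys/fs/kdbus/control", O_RDWR | O_CLOEXEC);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Bus names must be "<euid>-<suffix>"; the suffix is arbitrary. */
        snprintf(name, sizeof(name), "%u-example", (unsigned int)geteuid());

        name_item = offsetof(struct kdbus_item, str) + strlen(name) + 1;
        bloom_item = offsetof(struct kdbus_item, bloom_parameter) +
                     sizeof(struct kdbus_bloom_parameter);
        cmd_size = sizeof(*cmd) + ALIGN8(name_item) + ALIGN8(bloom_item);

        cmd = calloc(1, cmd_size);
        if (!cmd)
                return 1;
        cmd->size = cmd_size;

        /* Mandatory item 1: the bus name. */
        item = cmd->items;
        item->size = name_item;
        item->type = KDBUS_ITEM_MAKE_NAME;
        memcpy(item->str, name, strlen(name) + 1);

        /* Mandatory item 2: bus-wide bloom parameters (example values only). */
        item = (struct kdbus_item *)((uint8_t *)item + ALIGN8(item->size));
        item->size = bloom_item;
        item->type = KDBUS_ITEM_BLOOM_PARAMETER;
        item->bloom_parameter.size = 64;
        item->bloom_parameter.n_hash = 1;

        if (ioctl(fd, KDBUS_CMD_BUS_MAKE, cmd) < 0) {
                perror("KDBUS_CMD_BUS_MAKE");
                return 1;
        }

        /* Closing 'fd' would destroy the bus again, so keep it open. */
        pause();
        return 0;
}

The file descriptor used for KDBUS_CMD_BUS_MAKE cannot be reused for any other command and tears down the bus when closed, which is why the sketch simply keeps it open.
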
+ + + Following, you can see an example layout of the + kdbusfs filesystem: + + + /sys/fs/kdbus/ ; mount-point + |-- 0-system ; bus directory + | |-- bus ; default endpoint + | `-- 1017-custom ; custom endpoint + |-- 1000-user ; bus directory + | |-- bus ; default endpoint + | |-- 1000-service-A ; custom endpoint + | `-- 1000-service-B ; custom endpoint + `-- control ; control file + + + + + Mounting instances + + In order to get a new and separate kdbus environment, a new instance + of kdbusfs can be mounted like this: + + + # mount -t kdbusfs kdbusfs /tmp/new_kdbus/ + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.connection + 7 + + + + + kdbus.endpoint + 7 + + + + + mount + 8 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.item.xml linux-4.2-pck/Documentation/kdbus/kdbus.item.xml --- linux-4.2/Documentation/kdbus/kdbus.item.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.item.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,839 @@ + + + + + + + kdbus.item + kdbus item + + + + kdbus.item + 7 + + + + kdbus.item + kdbus item structure, layout and usage + + + + Description + + + To flexibly augment transport structures, data blobs of type + struct kdbus_item can be attached to the structs passed + into the ioctls. Some ioctls make items of certain types mandatory, + others are optional. Items that are unsupported by ioctls they are + attached to will cause the ioctl to fail with errno + set to EINVAL. + Items are also used for information stored in a connection's + pool, such as received messages, name lists or + requested connection or bus owner information. Depending on the type of + an item, its total size is either fixed or variable. + + + + Chaining items + + Whenever items are used as part of the kdbus kernel API, they are + embedded in structs that are embedded inside structs that themselves + include a size field containing the overall size of the structure. + This allows multiple items to be chained up, and an item iterator + (see below) is capable of detecting the end of an item chain. + + + + + Alignment + + The kernel expects all items to be aligned to 8-byte boundaries. + Unaligned items will cause the ioctl they are used with to fail + with errno set to EINVAL. + An item that has an unaligned size itself hence needs to be padded + if it is followed by another item. + + + + + Iterating items + + A simple iterator would iterate over the items until the items have + reached the embedding structure's overall size. An example + implementation is shown below. + + + size)) + +#define KDBUS_ITEM_FOREACH(item, head, first) \ + for ((item) = (head)->first; \ + ((uint8_t *)(item) < (uint8_t *)(head) + (head)->size) && \ + ((uint8_t *)(item) >= (uint8_t *)(head)); \ + (item) = KDBUS_ITEM_NEXT(item)) + ]]> + + + + + Item layout + + A struct kdbus_item consists of a + size field, describing its overall size, and a + type field, both 64 bit wide. They are followed by + a union to store information that is specific to the item's type. + The struct layout is shown below. 
+ + + +struct kdbus_item { + __u64 size; + __u64 type; + /* item payload - see below */ + union { + __u8 data[0]; + __u32 data32[0]; + __u64 data64[0]; + char str[0]; + + __u64 id; + struct kdbus_vec vec; + struct kdbus_creds creds; + struct kdbus_pids pids; + struct kdbus_audit audit; + struct kdbus_caps caps; + struct kdbus_timestamp timestamp; + struct kdbus_name name; + struct kdbus_bloom_parameter bloom_parameter; + struct kdbus_bloom_filter bloom_filter; + struct kdbus_memfd memfd; + int fds[0]; + struct kdbus_notify_name_change name_change; + struct kdbus_notify_id_change id_change; + struct kdbus_policy_access policy_access; + }; +}; + + + + struct kdbus_item should never be used to allocate + an item instance, as its size may grow in future releases of the API. + Instead, it should be manually assembled by storing the + size, type and payload to a + struct of its own. + + + + + Item types + + + Negotiation item + + + KDBUS_ITEM_NEGOTIATE + + With this item is attached to any ioctl, programs can + probe the kernel for known item types. + The item carries an array of uint64_t values in + item.data64, each set to an item type to + probe. The kernel will reset each member of this array that is + not recognized as valid item type to 0. + This way, users can negotiate kernel features at start-up to + keep newer userspace compatible with older kernels. This item + is never attached by the kernel in response to any command. + + + + + + + Command specific items + + + KDBUS_ITEM_PAYLOAD_VEC + KDBUS_ITEM_PAYLOAD_OFF + + Messages are directly copied by the sending process into the + receiver's + + kdbus.pool + 7 + . + This way, two peers can exchange data by effectively doing a + single-copy from one process to another; the kernel will not buffer + the data anywhere else. KDBUS_ITEM_PAYLOAD_VEC + is used when sending message. The item + references a memory address when the payload data can be found. + KDBUS_ITEM_PAYLOAD_OFF is used when messages + are received, and the + offset value describes the offset inside the + receiving connection's + + kdbus.pool + 7 + + where the message payload can be found. See + + kdbus.message + 7 + + for more information on passing of payload data along with a + message. + +struct kdbus_vec { + __u64 size; + union { + __u64 address; + __u64 offset; + }; +}; + + + + + + KDBUS_ITEM_PAYLOAD_MEMFD + + Transports a file descriptor of a memfd in + struct kdbus_memfd in item.memfd. + The size field has to match the actual size of + the memfd that was specified when it was created. The + start parameter denotes the offset inside the + memfd at which the referenced payload starts. See + + kdbus.message + 7 + + for more information on passing of payload data along with a + message. + +struct kdbus_memfd { + __u64 start; + __u64 size; + int fd; + __u32 __pad; +}; + + + + + + KDBUS_ITEM_FDS + + Contains an array of file descriptors. + When used with KDBUS_CMD_SEND, the values of + this array must be filled with valid file descriptor numbers. + When received as item attached to a message, the array will + contain the numbers of the installed file descriptors, or + -1 in case an error occurred. + In either case, the number of entries in the array is derived from + the item's total size. See + + kdbus.message + 7 + + for more information. + + + + + + + Items specific to some commands + + + KDBUS_ITEM_CANCEL_FD + + Transports a file descriptor that can be used to cancel a + synchronous KDBUS_CMD_SEND operation by + writing to it. The file descriptor is stored in + item.fd[0]. 
The item may only contain one + file descriptor. See + + kdbus.message + 7 + + for more information on this item and how to use it. + + + + + KDBUS_ITEM_BLOOM_PARAMETER + + Contains a set of bloom parameters as + struct kdbus_bloom_parameter in + item.bloom_parameter. + The item is passed from userspace to kernel during the + KDBUS_CMD_BUS_MAKE ioctl, and returned + verbatim when KDBUS_CMD_HELLO is called. + The kernel does not use the bloom parameters, but they need to + be known by each connection on the bus in order to define the + bloom filter hash details. See + + kdbus.match + 7 + + for more information on matching and bloom filters. + +struct kdbus_bloom_parameter { + __u64 size; + __u64 n_hash; +}; + + + + + + KDBUS_ITEM_BLOOM_FILTER + + Carries a bloom filter as + struct kdbus_bloom_filter in + item.bloom_filter. It is mandatory to send this + item attached to a struct kdbus_msg, in case the + message is a signal. This item is never transported from kernel to + userspace. See + + kdbus.match + 7 + + for more information on matching and bloom filters. + +struct kdbus_bloom_filter { + __u64 generation; + __u64 data[0]; +}; + + + + + + KDBUS_ITEM_BLOOM_MASK + + Transports a bloom mask as binary data blob + stored in item.data. This item is used to + describe a match into a connection's match database. See + + kdbus.match + 7 + + for more information on matching and bloom filters. + + + + + KDBUS_ITEM_DST_NAME + + Contains a well-known name to send a + message to, as null-terminated string in + item.str. This item is used with + KDBUS_CMD_SEND. See + + kdbus.message + 7 + + for more information on how to send a message. + + + + + KDBUS_ITEM_MAKE_NAME + + Contains a bus name or + endpoint name, stored as null-terminated + string in item.str. This item is sent from + userspace to kernel when buses or endpoints are created, and + returned back to userspace when the bus creator information is + queried. See + + kdbus.bus + 7 + + and + + kdbus.endpoint + 7 + . + + + + + KDBUS_ITEM_ATTACH_FLAGS_SEND + KDBUS_ITEM_ATTACH_FLAGS_RECV + + Contains a set of attach flags at + send or receive time. See + + kdbus + 7 + , + + kdbus.bus + 7 + and + + kdbus.connection + 7 + + for more information on attach flags. + + + + + KDBUS_ITEM_ID + + Transports a connection's numerical ID of + a connection as uint64_t value in + item.id. + + + + + KDBUS_ITEM_NAME + + Transports a name associated with the + name registry as null-terminated string as + struct kdbus_name in + item.name. The flags + contains the flags of the name. See + + kdbus.name + 7 + + for more information on how to access the name registry of a bus. + +struct kdbus_name { + __u64 flags; + char name[0]; +}; + + + + + + + + Items attached by the kernel as metadata + + + + KDBUS_ITEM_TIMESTAMP + + Contains both the monotonic and the + realtime timestamp, taken when the message + was processed on the kernel side. + Stored as struct kdbus_timestamp in + item.timestamp. + +struct kdbus_timestamp { + __u64 seqnum; + __u64 monotonic_ns; + __u64 realtime_ns; +}; + + + + + + KDBUS_ITEM_CREDS + + Contains a set of user and + group information as 32-bit values, in the + usual four flavors: real, effective, saved and filesystem related. + Stored as struct kdbus_creds in + item.creds. + +struct kdbus_creds { + __u32 uid; + __u32 euid; + __u32 suid; + __u32 fsuid; + __u32 gid; + __u32 egid; + __u32 sgid; + __u32 fsgid; +}; + + + + + + KDBUS_ITEM_PIDS + + Contains the PID, TID + and parent PID (PPID) of a remote peer. 
+ Stored as struct kdbus_pids in + item.pids. + +struct kdbus_pids { + __u64 pid; + __u64 tid; + __u64 ppid; +}; + + + + + + KDBUS_ITEM_AUXGROUPS + + Contains the auxiliary (supplementary) groups + a remote peer is a member of, stored as array of + uint32_t values in item.data32. + The array length can be determined by looking at the item's total + size, subtracting the size of the header and dividing the + remainder by sizeof(uint32_t). + + + + + KDBUS_ITEM_OWNED_NAME + + Contains a well-known name currently owned + by a connection. The name is stored as null-terminated string in + item.str. Its length can also be derived from + the item's total size. + + + + + KDBUS_ITEM_TID_COMM [*] + + Contains the comm string of a task's + TID (thread ID), stored as null-terminated + string in item.str. Its length can also be + derived from the item's total size. Receivers of this item should + not use its contents for any kind of security measures. See below. + + + + + KDBUS_ITEM_PID_COMM [*] + + Contains the comm string of a task's + PID (process ID), stored as null-terminated + string in item.str. Its length can also be + derived from the item's total size. Receivers of this item should + not use its contents for any kind of security measures. See below. + + + + + KDBUS_ITEM_EXE [*] + + Contains the path to the executable of a task, + stored as null-terminated string in item.str. Its + length can also be derived from the item's total size. Receivers of + this item should not use its contents for any kind of security + measures. See below. + + + + + KDBUS_ITEM_CMDLINE [*] + + Contains the command line arguments of a + task, stored as an array of null-terminated + strings in item.str. The total length of all + strings in the array can be derived from the item's total size. + Receivers of this item should not use its contents for any kind + of security measures. See below. + + + + + KDBUS_ITEM_CGROUP + + Contains the cgroup path of a task, stored + as null-terminated string in item.str. Its + length can also be derived from the item's total size. + + + + + KDBUS_ITEM_CAPS + + Contains sets of capabilities, stored as + struct kdbus_caps in item.caps. + As the item size may increase in the future, programs should be + written in a way that it takes + item.caps.last_cap into account, and derive + the number of sets and rows from the item size and the reported + number of valid capability bits. + +struct kdbus_caps { + __u32 last_cap; + __u32 caps[0]; +}; + + + + + + KDBUS_ITEM_SECLABEL + + Contains the LSM label of a task, stored as + null-terminated string in item.str. Its length + can also be derived from the item's total size. + + + + + KDBUS_ITEM_AUDIT + + Contains the audit sessionid and + loginuid of a task, stored as + struct kdbus_audit in + item.audit. + +struct kdbus_audit { + __u32 sessionid; + __u32 loginuid; +}; + + + + + + KDBUS_ITEM_CONN_DESCRIPTION + + Contains the connection description, as set + by KDBUS_CMD_HELLO or + KDBUS_CMD_CONN_UPDATE, stored as + null-terminated string in item.str. Its length + can also be derived from the item's total size. + + + + + + All metadata is automatically translated into the + namespaces of the task that receives them. See + + kdbus.message + 7 + + for more information. + + + + [*] Note that the content stored in metadata items of type + KDBUS_ITEM_TID_COMM, + KDBUS_ITEM_PID_COMM, + KDBUS_ITEM_EXE and + KDBUS_ITEM_CMDLINE + can easily be tampered by the sending tasks. Therefore, they should + not be used for any sort of security relevant + assumptions. 
The only reason they are transmitted is to let + receivers know about details that were set when metadata was + collected, even though the task they were collected from is not + active any longer when the items are received. + + + + + Items used for policy entries, matches and notifications + + + + KDBUS_ITEM_POLICY_ACCESS + + This item describes a policy access entry to + access the policy database of a + + kdbus.bus + 7 + or + + kdbus.endpoint + 7 + . + Please refer to + + kdbus.policy + 7 + + for more information on the policy database and how to access it. + +struct kdbus_policy_access { + __u64 type; + __u64 access; + __u64 id; +}; + + + + + + KDBUS_ITEM_ID_ADD + KDBUS_ITEM_ID_REMOVE + + This item is sent as attachment to a + kernel notification and indicates that a + new connection was created on the bus, or that a connection was + disconnected, respectively. It stores a + struct kdbus_notify_id_change in + item.id_change. + The id field contains the numeric ID of the + connection that was added or removed, and flags + is set to the connection flags, as passed by + KDBUS_CMD_HELLO. See + + kdbus.match + 7 + + and + + kdbus.message + 7 + + for more information on matches and notification messages. + +struct kdbus_notify_id_change { + __u64 id; + __u64 flags; +}; + + + + + + KDBUS_ITEM_NAME_ADD + KDBUS_ITEM_NAME_REMOVE + KDBUS_ITEM_NAME_CHANGE + + This item is sent as attachment to a + kernel notification and indicates that a + well-known name appeared, disappeared or + transferred to another owner on the bus. It stores a + struct kdbus_notify_name_change in + item.name_change. + old_id describes the former owner of the name + and is set to 0 values in case of + KDBUS_ITEM_NAME_ADD. + new_id describes the new owner of the name and + is set to 0 values in case of + KDBUS_ITEM_NAME_REMOVE. + The name field contains the well-known name the + notification is about, as null-terminated string. See + + kdbus.match + 7 + + and + + kdbus.message + 7 + + for more information on matches and notification messages. + +struct kdbus_notify_name_change { + struct kdbus_notify_id_change old_id; + struct kdbus_notify_id_change new_id; + char name[0]; +}; + + + + + + KDBUS_ITEM_REPLY_TIMEOUT + + This item is sent as attachment to a + kernel notification. It informs the receiver + that an expected reply to a message was not received in time. + The remote peer ID and the message cookie are stored in the message + header. See + + kdbus.message + 7 + + for more information about messages, timeouts and notifications. + + + + + KDBUS_ITEM_REPLY_DEAD + + This item is sent as attachment to a + kernel notification. It informs the receiver + that a remote connection a reply is expected from was disconnected + before that reply was sent. The remote peer ID and the message + cookie are stored in the message header. See + + kdbus.message + 7 + + for more information about messages, timeouts and notifications. 
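
The following sketch (not part of the patch) shows how a receiver might decode the notification items just described. How the struct kdbus_item pointer is obtained from a received kernel notification is covered in kdbus.message(7) and not repeated here; the function name is made up, and the example assumes the kdbus UAPI header from this patch.

#include <stdio.h>
#include <linux/kdbus.h>

/* Decode one item attached to a kernel notification message. */
static void handle_notify_item(const struct kdbus_item *item)
{
        switch (item->type) {
        case KDBUS_ITEM_ID_ADD:
        case KDBUS_ITEM_ID_REMOVE:
                printf("connection %llu %s the bus (flags 0x%llx)\n",
                       (unsigned long long)item->id_change.id,
                       item->type == KDBUS_ITEM_ID_ADD ? "joined" : "left",
                       (unsigned long long)item->id_change.flags);
                break;
        case KDBUS_ITEM_NAME_ADD:
        case KDBUS_ITEM_NAME_REMOVE:
        case KDBUS_ITEM_NAME_CHANGE:
                /* old_id is all zeros for NAME_ADD, new_id for NAME_REMOVE. */
                printf("name '%s' passed from connection %llu to %llu\n",
                       item->name_change.name,
                       (unsigned long long)item->name_change.old_id.id,
                       (unsigned long long)item->name_change.new_id.id);
                break;
        case KDBUS_ITEM_REPLY_TIMEOUT:
                printf("an expected reply was not received in time\n");
                break;
        case KDBUS_ITEM_REPLY_DEAD:
                printf("the peer a reply was expected from disconnected\n");
                break;
        default:
                break;  /* not a notification item */
        }
}

Note that KDBUS_ITEM_REPLY_TIMEOUT and KDBUS_ITEM_REPLY_DEAD carry the affected peer ID and message cookie in the message header rather than in the item, which is why the sketch prints nothing from those items.
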
+ + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.connection + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + memfd_create + 2 + + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.match.xml linux-4.2-pck/Documentation/kdbus/kdbus.match.xml --- linux-4.2/Documentation/kdbus/kdbus.match.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.match.xml 2015-09-08 00:48:22.208364829 -0300 @@ -0,0 +1,555 @@ + + + + + + + kdbus.match + kdbus.match + + + + kdbus.match + 7 + + + + kdbus.match + kdbus match + + + + Description + + + kdbus connections can install matches in order to subscribe to signal + messages sent on the bus. Such signal messages can be either directed + to a single connection (by setting a specific connection ID in + struct kdbus_msg.dst_id or by sending it to a + well-known name), or to potentially all currently + active connections on the bus (by setting + struct kdbus_msg.dst_id to + KDBUS_DST_ID_BROADCAST). + A signal message always has the KDBUS_MSG_SIGNAL + bit set in the flags bitfield. + Also, signal messages can originate from either the kernel (called + notifications), or from other bus connections. + In either case, a bus connection needs to have a suitable + match installed in order to receive any signal + message. Without any rules installed in the connection, no signal message + will be received. + + + + + Matches for signal messages from other connections + + Matches for messages from other connections (not kernel notifications) + are implemented as bloom filters (see below). The sender adds certain + properties of the message as elements to a bloom filter bit field, and + sends that along with the signal message. + + The receiving connection adds the message properties it is interested in + as elements to a bloom mask bit field, and uploads the mask as match rule, + possibly along with some other rules to further limit the match. + + The kernel will match the signal message's bloom filter against the + connection's bloom mask (simply by &-ing it), and will decide whether + the message should be delivered to a connection. + + + The kernel has no notion of any specific properties of the signal message, + all it sees are the bit fields of the bloom filter and the mask to match + against. The use of bloom filters allows simple and efficient matching, + without exposing any message properties or internals to the kernel side. + Clients need to deal with the fact that they might receive signal messages + which they did not subscribe to, as the bloom filter might allow + false-positives to pass the filter. + + To allow the future extension of the set of elements in the bloom filter, + the filter specifies a generation number. A later + generation must always contain all elements of the set of the previous + generation, but can add new elements to the set. The match rules mask can + carry an array with all previous generations of masks individually stored. + When the filter and mask are matched by the kernel, the mask with the + closest matching generation is selected as the index into the mask array. + + + + + Bloom filters + + Bloom filters allow checking whether a given word is present in a + dictionary. This allows connections to set up a mask for information it + is interested in, and will be delivered signal messages that have a + matching filter. 
+ + For general information, see + the Wikipedia + article on bloom filters. + + + The size of the bloom filter is defined per bus when it is created, in + kdbus_bloom_parameter.size. All bloom filters attached + to signal messages on the bus must match this size, and all bloom filter + matches uploaded by connections must also match the size, or a multiple + thereof (see below). + + The calculation of the mask has to be done in userspace applications. The + kernel just checks the bitmasks to decide whether or not to let the + message pass. All bits in the mask must match the filter in and bit-wise + AND logic, but the mask may have more bits set than + the filter. Consequently, false positive matches are expected to happen, + and programs must deal with that fact by checking the contents of the + payload again at receive time. + + + Masks are entities that are always passed to the kernel as part of a + match (with an item of type KDBUS_ITEM_BLOOM_MASK), + and filters can be attached to signals, with an item of type + KDBUS_ITEM_BLOOM_FILTER. For a filter to match, all + its bits have to be set in the match mask as well. + + + For example, consider a bus that has a bloom size of 8 bytes, and the + following mask/filter combinations: + + matches + + filter 0x0303030303030303 + mask 0x0101010101010101 + -> doesn't match + + filter 0x0101010101010101 + mask 0x0303030303030303 + -> matches + ]]> + + + Hence, in order to catch all messages, a mask filled with + 0xff bytes can be installed as a wildcard match rule. + + + + Generations + + + Uploaded matches may contain multiple masks, which have to be as large + as the bloom filter size defined by the bus. Each block of a mask is + called a generation, starting at index 0. + + At match time, when a signal is about to be delivered, a bloom mask + generation is passed, which denotes which of the bloom masks the filter + should be matched against. This allows programs to provide backward + compatible masks at upload time, while older clients can still match + against older versions of filters. + + + + + + Matches for kernel notifications + + To receive kernel generated notifications (see + + kdbus.message + 7 + ), + a connection must install match rules that are different from + the bloom filter matches described in the section above. They can be + filtered by the connection ID that caused the notification to be sent, by + one of the names it currently owns, or by the type of the notification + (ID/name add/remove/change). + + + + + Adding a match + + To add a match, the KDBUS_CMD_MATCH_ADD ioctl is + used, which takes a struct kdbus_cmd_match as an argument + described below. + + Note that each of the items attached to this command will internally + create one match rule, and the collection of them, + which is submitted as one block via the ioctl, is called a + match. To allow a message to pass, all rules of a + match have to be satisfied. Hence, adding more items to the command will + only narrow the possibility of a match to effectively let the message + pass, and will decrease the chance that the connection's process will be + woken up needlessly. + + Multiple matches can be installed per connection. As long as one of it has + a set of rules which allows the message to pass, this one will be + decisive. + + + +struct kdbus_cmd_match { + __u64 size; + __u64 flags; + __u64 return_flags; + __u64 cookie; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. 
+ + + + size + + The overall size of the struct, including its items. + + + + + flags + Flags to control the behavior of the ioctl. + + + KDBUS_MATCH_REPLACE + + Make the endpoint file group-accessible + + + + + KDBUS_FLAG_NEGOTIATE + + + Requests a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will return + 0, and the flags + field will have all bits set that are valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + cookie + + A cookie which identifies the match, so it can be referred to when + removing it. + + + + + items + + + Items to define the actual rules of the matches. The following item + types are expected. Each item will create one new match rule. + + + + KDBUS_ITEM_BLOOM_MASK + + + An item that carries the bloom filter mask to match against + in its data field. The payload size must match the bloom + filter size that was specified when the bus was created. + See the "Bloom filters" section above for more information on + bloom filters. + + + + + + KDBUS_ITEM_NAME + + + When used as part of kernel notifications, this item specifies + a name that is acquired, lost or that changed its owner (see + below). When used as part of a match for user-generated signal + messages, it specifies a name that the sending connection must + own at the time of sending the signal. + + + + + + KDBUS_ITEM_ID + + + Specify a sender connection's ID that will match this rule. + For kernel notifications, this specifies the ID of a + connection that was added to or removed from the bus. + For used-generated signals, it specifies the ID of the + connection that sent the signal message. + + + + + + KDBUS_ITEM_NAME_ADD + KDBUS_ITEM_NAME_REMOVE + KDBUS_ITEM_NAME_CHANGE + + + These items request delivery of kernel notifications that + describe a name acquisition, loss, or change. The details + are stored in the item's + kdbus_notify_name_change member. + All information specified must be matched in order to make + the message pass. Use + KDBUS_MATCH_ID_ANY to + match against any unique connection ID. + + + + + + KDBUS_ITEM_ID_ADD + KDBUS_ITEM_ID_REMOVE + + + These items request delivery of kernel notifications that are + generated when a connection is created or terminated. + struct kdbus_notify_id_change is used to + store the actual match information. This item can be used to + monitor one particular connection ID, or, when the ID field + is set to KDBUS_MATCH_ID_ANY, + all of them. + + + + + + KDBUS_ITEM_NEGOTIATE + + With this item, programs can probe the + kernel for known item types. See + + kdbus.item + 7 + + for more details. + + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + Refer to + + kdbus.message + 7 + + for more information on message types. + + + + + Removing a match + + Matches can be removed with the + KDBUS_CMD_MATCH_REMOVE ioctl, which takes + struct kdbus_cmd_match as argument, but its fields + usage slightly differs compared to that of + KDBUS_CMD_MATCH_ADD. + + + +struct kdbus_cmd_match { + __u64 size; + __u64 cookie; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + cookie + + The cookie of the match, as it was passed when the match was added. 
+ All matches that have this cookie will be removed. + + + + + flags + + No flags are supported for this use case. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. If set, the ioctl will fail with + -1, errno is set to + EPROTO, and the flags field + is set to 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + No items are supported for this use case, but + KDBUS_ITEM_NEGOTIATE is allowed nevertheless. + + + + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_MATCH_ADD</constant> may fail with the following + errors + + + + + EINVAL + + Illegal flags or items. + + + + + EDOM + + Illegal bloom filter size. + + + + + EMFILE + + Too many matches for this connection. + + + + + + + + <constant>KDBUS_CMD_MATCH_REMOVE</constant> may fail with the following + errors + + + + + EINVAL + + Illegal flags. + + + + + EBADSLT + + A match entry with the given cookie could not be found. + + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.match + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.item + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.message.xml linux-4.2-pck/Documentation/kdbus/kdbus.message.xml --- linux-4.2/Documentation/kdbus/kdbus.message.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.message.xml 2015-09-08 00:48:22.211698004 -0300 @@ -0,0 +1,1276 @@ + + + + + + + kdbus.message + kdbus.message + + + + kdbus.message + 7 + + + + kdbus.message + kdbus message + + + + Description + + + A kdbus message is used to exchange information between two connections + on a bus, or to transport notifications from the kernel to one or many + connections. This document describes the layout of messages, how payload + is added to them and how they are sent and received. + + + + + Message layout + + The layout of a message is shown below. + + + +-------------------------------------------------------------------------+ + | Message | + | +---------------------------------------------------------------------+ | + | | Header | | + | | size: overall message size, including the data records | | + | | destination: connection ID of the receiver | | + | | source: connection ID of the sender (set by kernel) | | + | | payload_type: "DBusDBus" textual identifier stored as uint64_t | | + | +---------------------------------------------------------------------+ | + | +---------------------------------------------------------------------+ | + | | Data Record | | + | | size: overall record size (without padding) | | + | | type: type of data | | + | | data: reference to data (address or file descriptor) | | + | +---------------------------------------------------------------------+ | + | +---------------------------------------------------------------------+ | + | | padding bytes to the next 8 byte alignment | | + | +---------------------------------------------------------------------+ | + | +---------------------------------------------------------------------+ | + | | Data Record | | + | | size: overall record size (without padding) | | + | | ... 
| | + | +---------------------------------------------------------------------+ | + | +---------------------------------------------------------------------+ | + | | padding bytes to the next 8 byte alignment | | + | +---------------------------------------------------------------------+ | + | +---------------------------------------------------------------------+ | + | | Data Record | | + | | size: overall record size | | + | | ... | | + | +---------------------------------------------------------------------+ | + | ... further data records ... | + +-------------------------------------------------------------------------+ + + + + + Message payload + + + When connecting to the bus, receivers request a memory pool of a given + size, large enough to carry all backlog of data enqueued for the + connection. The pool is internally backed by a shared memory file which + can be mmap()ed by the receiver. See + + kdbus.pool + 7 + + for more information. + + + + Message payload must be described in items attached to a message when + it is sent. A receiver can access the payload by looking at the items + that are attached to a message in its pool. The following items are used. + + + + + KDBUS_ITEM_PAYLOAD_VEC + + + This item references a piece of memory on the sender side which is + directly copied into the receiver's pool. This way, two peers can + exchange data by effectively doing a single-copy from one process + to another; the kernel will not buffer the data anywhere else. + This item is never found in a message received by a connection. + + + + + + KDBUS_ITEM_PAYLOAD_OFF + + + This item is attached to messages on the receiving side and points + to a memory area inside the receiver's pool. The + offset variable in the item denotes the memory + location relative to the message itself. + + + + + + KDBUS_ITEM_PAYLOAD_MEMFD + + + Messages can reference memfd files which + contain the data. memfd files are tmpfs-backed files that allow + sealing of the content of the file, which prevents all writable + access to the file content. + + + Only memfds that have + (F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_SEAL) + + set are accepted as payload data, which enforces reliable passing of + data. The receiver can assume that neither the sender nor anyone + else can alter the content after the message is sent. If those + seals are not set on the memfd, the ioctl will fail with + -1, and errno will be + set to ETXTBUSY. + + + + + + KDBUS_ITEM_FDS + + + Messages can transport regular file descriptors via + KDBUS_ITEM_FDS. This item carries an array + of int values in item.fd. The + maximum number of file descriptors in the item is + 253, and only one item of this type is + accepted per message. All passed values must be valid file + descriptors; the open count of each file descriptors is increased + by installing it to the receiver's task. This item can only be + used for directed messages, not for broadcasts, and only to + remote peers that have opted-in for receiving file descriptors + at connection time (KDBUS_HELLO_ACCEPT_FD). + + + + + + + The sender must not make any assumptions on the type in which data is + received by the remote peer. The kernel is free to re-pack multiple + KDBUS_ITEM_PAYLOAD_VEC and + KDBUS_ITEM_PAYLOAD_MEMFD payloads. For instance, the + kernel may decide to merge multiple VECs into a + single VEC, inline MEMFD + payloads into memory, or merge all passed VECs into a + single MEMFD. However, the kernel preserves the order + of passed data. 
This means that the order of all VEC + and MEMFD items is not changed in respect to each + other. In other words: All passed VEC and + MEMFD data payloads are treated as a single stream + of data that may be received by the remote peer in a different set of + chunks than it was sent as. + + + + + Sending messages + + + Messages are passed to the kernel with the + KDBUS_CMD_SEND ioctl. Depending on the destination + address of the message, the kernel delivers the message to the specific + destination connection, or to some subset of all connections on the same + bus. Sending messages across buses is not possible. Messages are always + queued in the memory pool of the destination connection (see above). + + + + The KDBUS_CMD_SEND ioctl uses a + struct kdbus_cmd_send to describe the message + transfer. + + +struct kdbus_cmd_send { + __u64 size; + __u64 flags; + __u64 return_flags; + __u64 msg_address; + struct kdbus_msg_info reply; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + Flags for message delivery + + + KDBUS_SEND_SYNC_REPLY + + + By default, all calls to kdbus are considered asynchronous, + non-blocking. However, as there are many use cases that need + to wait for a remote peer to answer a method call, there's a + way to send a message and wait for a reply in a synchronous + fashion. This is what the + KDBUS_SEND_SYNC_REPLY controls. The + KDBUS_CMD_SEND ioctl will block until the + reply has arrived, the timeout limit is reached, in case the + remote connection was shut down, or if interrupted by a signal + before any reply; see + + signal + 7 + . + + The offset of the reply message in the sender's pool is stored + in reply when the ioctl has returned without + error. Hence, there is no need for another + KDBUS_CMD_RECV ioctl or anything else to + receive the reply. + + + + + + KDBUS_FLAG_NEGOTIATE + + + Request a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will fail with + -1, errno + is set to EPROTO. + Once the ioctl returned, the flags + field will have all bits set that the kernel recognizes as + valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + msg_address + + In this field, users have to provide a pointer to a message + (struct kdbus_msg) to send. See below for a + detailed description. + + + + + reply + + Only used for synchronous replies. See description of + struct kdbus_cmd_recv for more details. + + + + + items + + + The following items are currently recognized. + + + + KDBUS_ITEM_CANCEL_FD + + + When this optional item is passed in, and the call is + executed as SYNC call, the passed in file descriptor can be + used as alternative cancellation point. The kernel will call + + poll + 2 + + on this file descriptor, and once it reports any incoming + bytes, the blocking send operation will be canceled; the + blocking, synchronous ioctl call will return + -1, and errno will + be set to ECANCELED. + Any type of file descriptor on which + + poll + 2 + + can be called on can be used as payload to this item; for + example, an eventfd can be used for this purpose, see + + eventfd + 2 + . + For asynchronous message sending, this item is allowed but + ignored. 
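For illustration, the sketch below prepares such a cancellation point with an eventfd. It is not authoritative: it assumes the file descriptor is stored as the first entry of the item's fd array (item.fds[0]); the exact payload layout is defined by the kdbus UAPI header.

#include <stddef.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <linux/kdbus.h>

/*
 * Sketch: fill a pre-allocated item slot of a KDBUS_CMD_SEND command
 * with a KDBUS_ITEM_CANCEL_FD entry and return the eventfd. Writing
 * to that eventfd from another thread makes the blocking, synchronous
 * ioctl fail with errno set to ECANCELED.
 */
static int fill_cancel_item(struct kdbus_item *item)
{
        int efd = eventfd(0, EFD_CLOEXEC);

        if (efd < 0)
                return -1;

        item->type = KDBUS_ITEM_CANCEL_FD;
        item->size = offsetof(struct kdbus_item, fds) + sizeof(int);
        item->fds[0] = efd;    /* assumed payload location, see above */
        return efd;
}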
+ + + + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + The message referenced by the msg_address above has + the following layout. + + + +struct kdbus_msg { + __u64 size; + __u64 flags; + __s64 priority; + __u64 dst_id; + __u64 src_id; + __u64 payload_type; + __u64 cookie; + __u64 timeout_ns; + __u64 cookie_reply; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + Flags to describe message details. + + + KDBUS_MSG_EXPECT_REPLY + + + Expect a reply to this message from the remote peer. With + this bit set, the timeout_ns field must be set to a non-zero + number of nanoseconds in which the receiving peer is expected + to reply. If such a reply is not received in time, the sender + will be notified with a timeout message (see below). The + value must be an absolute value, in nanoseconds and based on + CLOCK_MONOTONIC. + + For a message to be accepted as reply, it must be a direct + message to the original sender (not a broadcast and not a + signal message), and its + kdbus_msg.cookie_reply must match the + previous message's kdbus_msg.cookie. + + Expected replies also temporarily open the policy of the + sending connection, so the other peer is allowed to respond + within the given time window. + + + + + + KDBUS_MSG_NO_AUTO_START + + + By default, when a message is sent to an activator + connection, the activator is notified and will start an + implementer. This flag inhibits that behavior. With this bit + set, and the remote being an activator, the ioctl will fail + with errno set to + EADDRNOTAVAIL. + + + + + + KDBUS_FLAG_NEGOTIATE + + + Requests a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will return + 0, and the flags + field will have all bits set that are valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + priority + + The priority of this message. Receiving messages (see below) may + optionally be constrained to messages of a minimal priority. This + allows for use cases where timing critical data is interleaved with + control data on the same connection. If unused, the priority field + should be set to 0. + + + + + dst_id + + The numeric ID of the destination connection, or + KDBUS_DST_ID_BROADCAST + (~0ULL) to address every peer on the bus, or + KDBUS_DST_ID_NAME (0) to look + it up dynamically from the bus' name registry. + In the latter case, an item of type + KDBUS_ITEM_DST_NAME is mandatory. + Also see + + kdbus.name + 7 + + . + + + + + src_id + + Upon return of the ioctl, this member will contain the sending + connection's numerical ID. Should be 0 at send time. + + + + + payload_type + + Type of the payload in the actual data records. Currently, only + KDBUS_PAYLOAD_DBUS is accepted as input value + of this field. When receiving messages that are generated by the + kernel (notifications), this field will contain + KDBUS_PAYLOAD_KERNEL. + + + + + cookie + + Cookie of this message, for later recognition. Also, when replying + to a message (see above), the cookie_reply + field must match this value. + + + + + timeout_ns + + If the message sent requires a reply from the remote peer (see above), + this field contains the timeout in absolute nanoseconds based on + CLOCK_MONOTONIC. Also see + + clock_gettime + 2 + . 
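Since the value is absolute, callers typically derive it from the current CLOCK_MONOTONIC time. The helper below is not part of kdbus; it is a minimal sketch of that computation.

#include <stdint.h>
#include <time.h>

/* Sketch: absolute CLOCK_MONOTONIC deadline, 'rel_ms' milliseconds
 * from now, suitable for kdbus_msg.timeout_ns. */
static uint64_t reply_deadline_ns(uint64_t rel_ms)
{
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        return (uint64_t)now.tv_sec * 1000000000ULL +
               (uint64_t)now.tv_nsec + rel_ms * 1000000ULL;
}

/* Example: msg.timeout_ns = reply_deadline_ns(5000);  (five seconds) */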
+ + + + + cookie_reply + + If the message sent is a reply to another message, this field must + match the cookie of the formerly received message. + + + + + items + + + A dynamically sized list of items to contain additional information. + The following items are expected/valid: + + + + KDBUS_ITEM_PAYLOAD_VEC + KDBUS_ITEM_PAYLOAD_MEMFD + KDBUS_ITEM_FDS + + + Actual data records containing the payload. See section + "Message payload". + + + + + + KDBUS_ITEM_BLOOM_FILTER + + + Bloom filter for matches (see below). + + + + + + KDBUS_ITEM_DST_NAME + + + Well-known name to send this message to. Required if + dst_id is set to + KDBUS_DST_ID_NAME. + If a connection holding the given name can't be found, + the ioctl will fail with errno set to + ESRCH is returned. + + + For messages to a unique name (ID), this item is optional. If + present, the kernel will make sure the name owner matches the + given unique name. This allows programs to tie the message + sending to the condition that a name is currently owned by a + certain unique name. + + + + + + + + + + The message will be augmented by the requested metadata items when + queued into the receiver's pool. See + + kdbus.connection + 7 + + and + + kdbus.item + 7 + + for more information on metadata. + + + + + Receiving messages + + + Messages are received by the client with the + KDBUS_CMD_RECV ioctl. The endpoint file of the bus + supports poll()/epoll()/select(); when new messages + are available on the connection's file descriptor, + POLLIN is reported. For compatibility reasons, + POLLOUT is always reported as well. Note, however, + that the latter does not guarantee that a message can in fact be sent, as + this depends on how many pending messages the receiver has in its pool. + + + + With the KDBUS_CMD_RECV ioctl, a + struct kdbus_cmd_recv is used. + + + +struct kdbus_cmd_recv { + __u64 size; + __u64 flags; + __u64 return_flags; + __s64 priority; + __u64 dropped_msgs; + struct kdbus_msg_info msg; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + Flags to control the receive command. + + + KDBUS_RECV_PEEK + + + Just return the location of the next message. Do not install + file descriptors or anything else. This is usually used to + determine the sender of the next queued message. + + + + + + KDBUS_RECV_DROP + + + Drop the next message without doing anything else with it, + and free the pool slice. This a short-cut for + KDBUS_RECV_PEEK and + KDBUS_CMD_FREE. + + + + + + KDBUS_RECV_USE_PRIORITY + + + Dequeue the messages ordered by their priority, and filtering + them with the priority field (see below). + + + + + + KDBUS_FLAG_NEGOTIATE + + + Request a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will fail with + -1, errno + is set to EPROTO. + Once the ioctl returned, the flags + field will have all bits set that the kernel recognizes as + valid for this command. + + + + + + + + + return_flags + + Flags returned by the kernel. If the dropped_msgs + field is non-zero, KDBUS_RECV_RETURN_DROPPED_MSGS + is set. If a file descriptor could not be installed, the + KDBUS_RECV_RETURN_INCOMPLETE_FDS flag is set. + + + + + priority + + With KDBUS_RECV_USE_PRIORITY set in + flags, messages will be dequeued ordered by their + priority, starting with the highest value. 
Also, messages will be + filtered by the value given in this field, so the returned message + will at least have the requested priority. If no such message is + waiting in the queue, the ioctl will fail, and + errno will be set to EAGAIN. + + + + + dropped_msgs + + Whenever a message with KDBUS_MSG_SIGNAL is sent + but cannot be queued on a peer (e.g., as it contains FDs but the peer + does not support FDs, or there is no space left in the peer's pool) + the 'dropped_msgs' counter of the peer is incremented. On the next + RECV ioctl, the 'dropped_msgs' field is copied into the ioctl struct + and cleared on the peer. If it was non-zero, the + KDBUS_RECV_RETURN_DROPPED_MSGS flag will be set + in return_flags. Note that this will only happen + if the ioctl succeeded or failed with EAGAIN. In + other error cases, the 'dropped_msgs' field of the peer is left + untouched. + + + + + msg + + Embedded struct containing information on the received message when + this command succeeded (see below). + + + + + items + + Items to specify further details for the receive command. + Currently unused, and all items will be rejected with + errno set to EINVAL. + + + + + + Both struct kdbus_cmd_recv and + struct kdbus_cmd_send embed + struct kdbus_msg_info. + For the KDBUS_CMD_SEND ioctl, it is used to catch + synchronous replies, if one was requested, and is unused otherwise. + + + +struct kdbus_msg_info { + __u64 offset; + __u64 msg_size; + __u64 return_flags; +}; + + + The fields in this struct are described below. + + + + offset + + Upon return of the ioctl, this field contains the offset in the + receiver's memory pool. The memory must be freed with + KDBUS_CMD_FREE. See + + kdbus.pool + 7 + + for further details. + + + + + msg_size + + Upon successful return of the ioctl, this field contains the size of + the allocated slice at offset offset. + It is the combination of the size of the stored + struct kdbus_msg object plus all appended VECs. + You can use it in combination with offset to map + a single message, instead of mapping the entire pool. See + + kdbus.pool + 7 + + for further details. + + + + + return_flags + + + Kernel-provided return flags. Currently, the following flags are + defined. + + + + KDBUS_RECV_RETURN_INCOMPLETE_FDS + + + The message contained memfds or file descriptors, and the + kernel failed to install one or more of them at receive time. + Most probably that happened because the maximum number of + file descriptors for the receiver's task were exceeded. + In such cases, the message is still delivered, so this is not + a fatal condition. File descriptors numbers inside the + KDBUS_ITEM_FDS item or memfd files + referenced by KDBUS_ITEM_PAYLOAD_MEMFD + items which could not be installed will be set to + -1. + + + + + + + + + + Unless KDBUS_RECV_DROP was passed, the + offset field contains the location of the new message + inside the receiver's pool after the KDBUS_CMD_RECV + ioctl was employed. The message is stored as struct kdbus_msg + at this offset, and can be interpreted with the semantics described above. + + + Also, if the connection allowed for file descriptor to be passed + (KDBUS_HELLO_ACCEPT_FD), and if the message contained + any, they will be installed into the receiving process when the + KDBUS_CMD_RECV ioctl is called. + memfds may always be part of the message payload. + The receiving task is obliged to close all file descriptors appropriately + once no longer needed. If KDBUS_RECV_PEEK is set, no + file descriptors are installed. 
This allows for peeking at a message, + looking at its metadata only and dropping it via + KDBUS_RECV_DROP, without installing any of the file + descriptors into the receiving process. + + + The caller is obliged to call the KDBUS_CMD_FREE + ioctl with the returned offset when the memory is no longer needed. + + + + + Notifications + + A kernel notification is a regular kdbus message with the following + details. + + + + + kdbus_msg.src_id == KDBUS_SRC_ID_KERNEL + + + kdbus_msg.dst_id == KDBUS_DST_ID_BROADCAST + + + kdbus_msg.payload_type == KDBUS_PAYLOAD_KERNEL + + + Has exactly one of the items attached that are described below. + + + Always has a timestamp item (KDBUS_ITEM_TIMESTAMP) + attached. + + + + + The kernel will notify its users of the following events. + + + + + When connection A is terminated while connection + B is waiting for a reply from it, connection + B is notified with a message with an item of + type KDBUS_ITEM_REPLY_DEAD. + + + + When connection A does not receive a reply from + connection B within the specified timeout window, + connection A will receive a message with an + item of type KDBUS_ITEM_REPLY_TIMEOUT. + + + + When an ordinary connection (not a monitor) is created on or removed + from a bus, messages with an item of type + KDBUS_ITEM_ID_ADD or + KDBUS_ITEM_ID_REMOVE, respectively, are delivered + to all bus members that match these messages through their match + database. Eavesdroppers (monitor connections) do not cause such + notifications to be sent. They are invisible on the bus. + + + + When a connection gains or loses ownership of a name, messages with an + item of type KDBUS_ITEM_NAME_ADD, + KDBUS_ITEM_NAME_REMOVE or + KDBUS_ITEM_NAME_CHANGE are delivered to all bus + members that match these messages through their match database. + + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_SEND</constant> may fail with the following + errors + + + + + EOPNOTSUPP + + The connection is not an ordinary connection, or the passed + file descriptors in KDBUS_ITEM_FDS item are + either kdbus handles or unix domain sockets. Both are currently + unsupported. + + + + + EINVAL + + The submitted payload type is + KDBUS_PAYLOAD_KERNEL, + KDBUS_MSG_EXPECT_REPLY was set without timeout + or cookie values, KDBUS_SEND_SYNC_REPLY was + set without KDBUS_MSG_EXPECT_REPLY, an invalid + item was supplied, src_id was non-zero and was + different from the current connection's ID, a supplied memfd had a + size of 0, or a string was not properly null-terminated. + + + + + ENOTUNIQ + + The supplied destination is + KDBUS_DST_ID_BROADCAST and either + file descriptors were passed, or + KDBUS_MSG_EXPECT_REPLY was set, + or a timeout was given. + + + + + E2BIG + + Too many items. + + + + + EMSGSIZE + + The size of the message header and items or the payload vector + is excessive. + + + + + EEXIST + + Multiple KDBUS_ITEM_FDS, + KDBUS_ITEM_BLOOM_FILTER or + KDBUS_ITEM_DST_NAME items were supplied. + + + + + EBADF + + The supplied KDBUS_ITEM_FDS or + KDBUS_ITEM_PAYLOAD_MEMFD items + contained an illegal file descriptor. + + + + + EMEDIUMTYPE + + The supplied memfd is not a sealed kdbus memfd. + + + + + EMFILE + + Too many file descriptors inside a + KDBUS_ITEM_FDS. 
+ + + + + EBADMSG + + An item had illegal size, both a dst_id and a + KDBUS_ITEM_DST_NAME was given, or both a name + and a bloom filter was given. + + + + + ETXTBSY + + The supplied kdbus memfd file cannot be sealed or the seal + was removed, because it is shared with other processes or + still mapped with + + mmap + 2 + . + + + + + ECOMM + + A peer does not accept the file descriptors addressed to it. + + + + + EFAULT + + The supplied bloom filter size was not 64-bit aligned, or supplied + memory could not be accessed by the kernel. + + + + + EDOM + + The supplied bloom filter size did not match the bloom filter + size of the bus. + + + + + EDESTADDRREQ + + dst_id was set to + KDBUS_DST_ID_NAME, but no + KDBUS_ITEM_DST_NAME was attached. + + + + + ESRCH + + The name to look up was not found in the name registry. + + + + + EADDRNOTAVAIL + + KDBUS_MSG_NO_AUTO_START was given but the + destination connection is an activator. + + + + + ENXIO + + The passed numeric destination connection ID couldn't be found, + or is not connected. + + + + + ECONNRESET + + The destination connection is no longer active. + + + + + ETIMEDOUT + + Timeout while synchronously waiting for a reply. + + + + + EINTR + + Interrupted system call while synchronously waiting for a reply. + + + + + EPIPE + + When sending a message, a synchronous reply from the receiving + connection was expected but the connection died before answering. + + + + + ENOBUFS + + Too many pending messages on the receiver side. + + + + + EREMCHG + + Both a well-known name and a unique name (ID) was given, but + the name is not currently owned by that connection. + + + + + EXFULL + + The memory pool of the receiver is full. + + + + + EREMOTEIO + + While synchronously waiting for a reply, the remote peer + failed with an I/O error. + + + + + + + + <constant>KDBUS_CMD_RECV</constant> may fail with the following + errors + + + + + EOPNOTSUPP + + The connection is not an ordinary connection, or the passed + file descriptors are either kdbus handles or unix domain + sockets. Both are currently unsupported. + + + + + EINVAL + + Invalid flags or offset. + + + + + EAGAIN + + No message found in the queue. + + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.connection + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.item + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + clock_gettime + 2 + + + + + ioctl + 2 + + + + + poll + 2 + + + + + select + 2 + + + + + epoll + 7 + + + + + eventfd + 2 + + + + + memfd_create + 2 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.name.xml linux-4.2-pck/Documentation/kdbus/kdbus.name.xml --- linux-4.2/Documentation/kdbus/kdbus.name.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.name.xml 2015-09-08 00:48:22.211698004 -0300 @@ -0,0 +1,711 @@ + + + + + + + kdbus.name + kdbus.name + + + + kdbus.name + 7 + + + + kdbus.name + kdbus.name + + + + Description + + Each + + kdbus.bus + 7 + + instantiates a name registry to resolve well-known names into unique + connection IDs for message delivery. The registry will be queried when a + message is sent with kdbus_msg.dst_id set to + KDBUS_DST_ID_NAME, or when a registry dump is + requested with KDBUS_CMD_NAME_LIST. + + + + All of the below is subject to policy rules for SEE + and OWN permissions. See + + kdbus.policy + 7 + + for more information. + + + + + Name validity + + A name has to comply with the following rules in order to be considered + valid. 
+ + + + + + The name has two or more elements separated by a + '.' (period) character. + + + + + All elements must contain at least one character. + + + + + Each element must only contain the ASCII characters + [A-Z][a-z][0-9]_ and must not begin with a + digit. + + + + + The name must contain at least one '.' (period) + character (and thus at least two elements). + + + + + The name must not begin with a '.' (period) + character. + + + + + The name must not exceed 255 characters in + length. + + + + + + + Acquiring a name + + To acquire a name, a client uses the + KDBUS_CMD_NAME_ACQUIRE ioctl with + struct kdbus_cmd as argument. + + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + Flags to control details in the name acquisition. + + + KDBUS_NAME_REPLACE_EXISTING + + + Acquiring a name that is already present usually fails, + unless this flag is set in the call, and + KDBUS_NAME_ALLOW_REPLACEMENT (see below) + was set when the current owner of the name acquired it, or + if the current owner is an activator connection (see + + kdbus.connection + 7 + ). + + + + + + KDBUS_NAME_ALLOW_REPLACEMENT + + + Allow other connections to take over this name. When this + happens, the former owner of the connection will be notified + of the name loss. + + + + + + KDBUS_NAME_QUEUE + + + A name that is already acquired by a connection can not be + acquired again (unless the + KDBUS_NAME_ALLOW_REPLACEMENT flag was + set during acquisition; see above). + However, a connection can put itself in a queue of + connections waiting for the name to be released. Once that + happens, the first connection in that queue becomes the new + owner and is notified accordingly. + + + + + + KDBUS_FLAG_NEGOTIATE + + + Request a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will fail with + -1, and errno + is set to EPROTO. + Once the ioctl returned, the flags + field will have all bits set that the kernel recognizes as + valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + return_flags + + + Flags returned by the kernel. Currently, the following may be + returned by the kernel. + + + + KDBUS_NAME_IN_QUEUE + + + The name was not acquired yet, but the connection was + placed in the queue of peers waiting for the name. + This can only happen if KDBUS_NAME_QUEUE + was set in the flags member (see above). + The connection will receive a name owner change notification + once the current owner has given up the name and its + ownership was transferred. + + + + + + + + + items + + + Items to submit the name. Currently, one item of type + KDBUS_ITEM_NAME is expected and allowed, and + the contained string must be a valid bus name. + KDBUS_ITEM_NEGOTIATE may be used to probe for + valid item types. See + + kdbus.item + 7 + + for a detailed description of how this item is used. + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to >EINVAL. + + + + + + + + Releasing a name + + A connection may release a name explicitly with the + KDBUS_CMD_NAME_RELEASE ioctl. If the connection was + an implementer of an activatable name, its pending messages are moved + back to the activator. If there are any connections queued up as waiters + for the name, the first one in the queue (the oldest entry) will become + the new owner. 
The same happens implicitly for all names once a + connection terminates. See + + kdbus.connection + 7 + + for more information on connections. + + + The KDBUS_CMD_NAME_RELEASE ioctl uses the same data + structure as the acquisition call + (KDBUS_CMD_NAME_ACQUIRE), + but with slightly different field usage. + + + +struct kdbus_cmd { + __u64 size; + __u64 flags; + __u64 return_flags; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Flags to the command. Currently unused. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. If set, the ioctl will return 0, + and the flags field is set to + 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + items + + + Items to submit the name. Currently, one item of type + KDBUS_ITEM_NAME is expected and allowed, and + the contained string must be a valid bus name. + KDBUS_ITEM_NEGOTIATE may be used to probe for + valid item types. See + + kdbus.item + 7 + + for a detailed description of how this item is used. + + + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + + + + + + + + Dumping the name registry + + A connection may request a complete or filtered dump of currently active + bus names with the KDBUS_CMD_LIST ioctl, which + takes a struct kdbus_cmd_list as argument. + + + +struct kdbus_cmd_list { + __u64 flags; + __u64 return_flags; + __u64 offset; +}; + + + The fields in this struct are described below. + + + + flags + + + Any combination of flags to specify which names should be dumped. + + + + KDBUS_LIST_UNIQUE + + + List the unique (numeric) IDs of the connection, whether it + owns a name or not. + + + + + + KDBUS_LIST_NAMES + + + List well-known names stored in the database which are + actively owned by a real connection (not an activator). + + + + + + KDBUS_LIST_ACTIVATORS + + + List names that are owned by an activator. + + + + + + KDBUS_LIST_QUEUED + + + List connections that are not yet owning a name but are + waiting for it to become available. + + + + + + KDBUS_FLAG_NEGOTIATE + + + Request a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will fail with + -1, and errno + is set to EPROTO. + Once the ioctl returned, the flags + field will have all bits set that the kernel recognizes as + valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + offset + + When the ioctl returns successfully, the offset to the name registry + dump inside the connection's pool will be stored in this field. + + + + + + The returned list of names is stored in a struct kdbus_list + that in turn contains an array of type struct kdbus_info, + The array-size in bytes is given as list_size. + The fields inside struct kdbus_info is described next. + + + +struct kdbus_info { + __u64 size; + __u64 id; + __u64 flags; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + id + + The owning connection's unique ID. + + + + + flags + + The flags of the owning connection. + + + + + items + + + Items containing the actual name. Currently, one item of type + KDBUS_ITEM_OWNED_NAME will be attached, + including the name's flags. 
In that item, the flags field of the + name may carry the following bits: + + + + KDBUS_NAME_ALLOW_REPLACEMENT + + + Other connections are allowed to take over this name from the + connection that owns it. + + + + + + KDBUS_NAME_IN_QUEUE + + + When retrieving a list of currently acquired names in the + registry, this flag indicates whether the connection + actually owns the name or is currently waiting for it to + become available. + + + + + + KDBUS_NAME_ACTIVATOR + + + An activator connection owns a name as a placeholder for an + implementer, which is started on demand by programs as soon + as the first message arrives. There's some more information + on this topic in + + kdbus.connection + 7 + + . + + + In contrast to + KDBUS_NAME_REPLACE_EXISTING, + when a name is taken over from an activator connection, all + the messages that have been queued in the activator + connection will be moved over to the new owner. The activator + connection will still be tracked for the name and will take + control again if the implementer connection terminates. + + + This flag can not be used when acquiring a name, but is + implicitly set through KDBUS_CMD_HELLO + with KDBUS_HELLO_ACTIVATOR set in + kdbus_cmd_hello.conn_flags. + + + + + + KDBUS_FLAG_NEGOTIATE + + + Requests a set of valid flags for this ioctl. When this bit is + set, no action is taken; the ioctl will return + 0, and the flags + field will have all bits set that are valid for this command. + The KDBUS_FLAG_NEGOTIATE bit will be + cleared by the operation. + + + + + + + + + + The returned buffer must be freed with the + KDBUS_CMD_FREE ioctl when the user is finished with + it. See + + kdbus.pool + 7 + + for more information. + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_NAME_ACQUIRE</constant> may fail with the following + errors + + + + + EINVAL + + Illegal command flags, illegal name provided, or an activator + tried to acquire a second name. + + + + + EPERM + + Policy prohibited name ownership. + + + + + EALREADY + + Connection already owns that name. + + + + + EEXIST + + The name already exists and can not be taken over. + + + + + E2BIG + + The maximum number of well-known names per connection is exhausted. + + + + + + + + <constant>KDBUS_CMD_NAME_RELEASE</constant> + may fail with the following errors + + + + + EINVAL + + Invalid command flags, or invalid name provided. + + + + + ESRCH + + Name is not found in the registry. + + + + + EADDRINUSE + + Name is owned by a different connection and can't be released. + + + + + + + + <constant>KDBUS_CMD_LIST</constant> may fail with the following + errors + + + + + EINVAL + + Invalid command flags + + + + + ENOBUFS + + No available memory in the connection's pool. 
+ + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.connection + 7 + + + + + kdbus.item + 7 + + + + + kdbus.policy + 7 + + + + + kdbus.pool + 7 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.policy.xml linux-4.2-pck/Documentation/kdbus/kdbus.policy.xml --- linux-4.2/Documentation/kdbus/kdbus.policy.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.policy.xml 2015-09-08 00:48:22.211698004 -0300 @@ -0,0 +1,406 @@ + + + + + + + kdbus.policy + kdbus.policy + + + + kdbus.policy + 7 + + + + kdbus.policy + kdbus policy + + + + Description + + + A kdbus policy restricts the possibilities of connections to own, see and + talk to well-known names. A policy can be associated with a bus (through a + policy holder connection) or a custom endpoint. kdbus stores its policy + information in a database that can be accessed through the following + ioctl commands: + + + + + KDBUS_CMD_HELLO + + When creating, or updating, a policy holder connection. See + + kdbus.connection + 7 + . + + + + + KDBUS_CMD_ENDPOINT_MAKE + KDBUS_CMD_ENDPOINT_UPDATE + + When creating, or updating, a bus custom endpoint. See + + kdbus.endpoint + 7 + . + + + + + + In all cases, the name and policy access information is stored in items + of type KDBUS_ITEM_NAME and + KDBUS_ITEM_POLICY_ACCESS. For this transport, the + following rules apply. + + + + + + An item of type KDBUS_ITEM_NAME must be followed + by at least one KDBUS_ITEM_POLICY_ACCESS item. + + + + + + An item of type KDBUS_ITEM_NAME can be followed + by an arbitrary number of + KDBUS_ITEM_POLICY_ACCESS items. + + + + + + An arbitrary number of groups of names and access levels can be given. + + + + + + Names passed in items of type KDBUS_ITEM_NAME must + comply to the rules of valid kdbus.name. See + + kdbus.name + 7 + + for more information. + + The payload of an item of type + KDBUS_ITEM_POLICY_ACCESS is defined by the following + struct. For more information on the layout of items, please refer to + + kdbus.item + 7 + . + + + +struct kdbus_policy_access { + __u64 type; + __u64 access; + __u64 id; +}; + + + The fields in this struct are described below. + + + + type + + + One of the following. + + + + + KDBUS_POLICY_ACCESS_USER + + Grant access to a user with the UID stored in the + id field. + + + + + KDBUS_POLICY_ACCESS_GROUP + + Grant access to a user with the GID stored in the + id field. + + + + + KDBUS_POLICY_ACCESS_WORLD + + Grant access to everyone. The id field + is ignored. + + + + + + + + access + + + The access to grant. One of the following. + + + + + KDBUS_POLICY_SEE + + Allow the name to be seen. + + + + + KDBUS_POLICY_TALK + + Allow the name to be talked to. + + + + + KDBUS_POLICY_OWN + + Allow the name to be owned. + + + + + + + + id + + For KDBUS_POLICY_ACCESS_USER, stores the UID. + For KDBUS_POLICY_ACCESS_GROUP, stores the GID. + + + + + + + All endpoints of buses have an empty policy database by default. + Therefore, unless policy rules are added, all operations will also be + denied by default. Also see + + kdbus.endpoint + 7 + . + + + + + Wildcard names + + Policy holder connections may upload names that contain the wildcard + suffix (".*"). Such a policy entry is effective for + every well-known name that extends the provided name by exactly one more + level. + + For example, the name foo.bar.* matches both + "foo.bar.baz" and + "foo.bar.bazbaz" are, but not + "foo.bar.baz.baz". 
+ + This allows connections to take control over multiple names that the + policy holder doesn't need to know about when uploading the policy. + + Such wildcard entries are not allowed for custom endpoints. + + + + + Privileged connections + + The policy database is overruled when action is taken by a privileged + connection. Please refer to + + kdbus.connection + 7 + + for more information on what makes a connection privileged. + + + + + Examples + + For instance, a set of policy rules may look like this: + + + +KDBUS_ITEM_NAME: str='org.foo.bar' +KDBUS_ITEM_POLICY_ACCESS: type=USER, access=OWN, ID=1000 +KDBUS_ITEM_POLICY_ACCESS: type=USER, access=TALK, ID=1001 +KDBUS_ITEM_POLICY_ACCESS: type=WORLD, access=SEE + +KDBUS_ITEM_NAME: str='org.blah.baz' +KDBUS_ITEM_POLICY_ACCESS: type=USER, access=OWN, ID=0 +KDBUS_ITEM_POLICY_ACCESS: type=WORLD, access=TALK + + + + That means that 'org.foo.bar' may only be owned by UID 1000, but every + user on the bus is allowed to see the name. However, only UID 1001 may + actually send a message to the connection and receive a reply from it. + + The second rule allows 'org.blah.baz' to be owned by UID 0 only, but + every user may talk to it. + + + + + TALK access and multiple well-known names per connection + + Note that TALK access is checked against all names of a connection. For + example, if a connection owns both 'org.foo.bar' and + 'org.blah.baz', and the policy database allows + 'org.blah.baz' to be talked to by WORLD, then this + permission is also granted to 'org.foo.bar'. That + might sound illogical, but after all, we allow messages to be directed to + either the ID or a well-known name, and policy is applied to the + connection, not the name. In other words, the effective TALK policy for a + connection is the most permissive of all names the connection owns. + + For broadcast messages, the receiver needs TALK permissions to the sender + to receive the broadcast. + + + Both the endpoint and the bus policy databases are consulted to allow + name registry listing, owning a well-known name and message delivery. + If either one fails, the operation is failed with + errno set to EPERM. + + For best practices, connections that own names with a restricted TALK + access should not install matches. This avoids cases where the sent + message may pass the bloom filter due to false-positives and may also + satisfy the policy rules. + + Also see + + kdbus.match + 7 + . + + + + + Implicit policies + + Depending on the type of the endpoint, a set of implicit rules that + override installed policies might be enforced. + + On default endpoints, the following set is enforced and checked before + any user-supplied policy is checked. + + + + + + Privileged connections always override any installed policy. Those + connections could easily install their own policies, so there is no + reason to enforce installed policies. + + + + + Connections can always talk to connections of the same user. This + includes broadcast messages. + + + + + + Custom endpoints have stricter policies. The following rules apply: + + + + + + Policy rules are always enforced, even if the connection is a + privileged connection. + + + + + Policy rules are always enforced for TALK access, + even if both ends are running under the same user. This includes + broadcast messages. + + + + + To restrict the set of names that can be seen, endpoint policies can + install SEE policies. 
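As a concrete illustration of the struct kdbus_policy_access payload described above, the first rule group from the Examples section could be initialized as follows. This is only a sketch of the payload data; packing these entries into KDBUS_ITEM_POLICY_ACCESS items and uploading them via KDBUS_CMD_HELLO or KDBUS_CMD_ENDPOINT_MAKE is omitted.

#include <linux/kdbus.h>

/* Policy payloads matching the 'org.foo.bar' example above. */
static const struct kdbus_policy_access org_foo_bar_policy[] = {
        { .type = KDBUS_POLICY_ACCESS_USER,  .access = KDBUS_POLICY_OWN,  .id = 1000 },
        { .type = KDBUS_POLICY_ACCESS_USER,  .access = KDBUS_POLICY_TALK, .id = 1001 },
        { .type = KDBUS_POLICY_ACCESS_WORLD, .access = KDBUS_POLICY_SEE,  .id = 0 },
};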
+ + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.item + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.pool.xml linux-4.2-pck/Documentation/kdbus/kdbus.pool.xml --- linux-4.2/Documentation/kdbus/kdbus.pool.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.pool.xml 2015-09-08 00:48:22.211698004 -0300 @@ -0,0 +1,326 @@ + + + + + + + kdbus.pool + kdbus.pool + + + + kdbus.pool + 7 + + + + kdbus.pool + kdbus pool + + + + Description + + A pool for data received from the kernel is installed for every + connection of the bus, and + is sized according to the information stored in the + pool_size member of struct kdbus_cmd_hello + when KDBUS_CMD_HELLO is employed. Internally, the + pool is segmented into slices, each referenced by its + offset in the pool, expressed in bytes. + See + + kdbus.connection + 7 + + for more information about KDBUS_CMD_HELLO. + + + + The pool is written to by the kernel when one of the following + ioctls is issued: + + + + KDBUS_CMD_HELLO + + ... to receive details about the bus the connection was made to + + + + KDBUS_CMD_RECV + + ... to receive a message + + + + KDBUS_CMD_LIST + + ... to dump the name registry + + + + KDBUS_CMD_CONN_INFO + + ... to retrieve information on a connection + + + + KDBUS_CMD_BUS_CREATOR_INFO + + ... to retrieve information about a connection's bus creator + + + + + + + The offset fields returned by either one of the + aforementioned ioctls describe offsets inside the pool. In order to make + the slice available for subsequent calls, + KDBUS_CMD_FREE has to be called on that offset + (see below). Otherwise, the pool will fill up, and the connection won't + be able to receive any more information through its pool. + + + + + Pool slice allocation + + Pool slices are allocated by the kernel in order to report information + back to a task, such as messages, returned name list etc. + Allocation of pool slices cannot be initiated by userspace. See + + kdbus.connection + 7 + + and + + kdbus.name + 7 + + for examples of commands that use the pool to + return data. + + + + + Accessing the pool memory + + Memory in the pool is read-only for userspace and may only be written + to by the kernel. To read from the pool memory, the caller is expected to + + mmap + 2 + + the buffer into its task, like this: + + +uint8_t *buf = mmap(NULL, size, PROT_READ, MAP_SHARED, conn_fd, 0); + + + + In order to map the entire pool, the size parameter in + the example above should be set to the value of the + pool_size member of + struct kdbus_cmd_hello when + KDBUS_CMD_HELLO was employed to create the + connection (see above). + + + + The file descriptor used to map the memory must be + the one that was used to create the connection. + In other words, the one that was used to call + KDBUS_CMD_HELLO. See + + kdbus.connection + 7 + + for more details. + + + + Alternatively, instead of mapping the entire pool buffer, only parts + of it can be mapped. Every kdbus command that returns an + offset (see above) also reports a + size along with it, so programs can be written + in a way that it only maps portions of the pool to access a specific + slice. + + + + When access to the pool memory is no longer needed, programs should + call munmap() on the pointer returned by + mmap(). 
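Putting the above together, the following sketch maps the pool once after KDBUS_CMD_HELLO, reads one message slice returned by KDBUS_CMD_RECV and then releases it with KDBUS_CMD_FREE, which is described in the next section. Error handling is omitted and the linux/kdbus.h include path is assumed; the struct usage follows the layouts given in this document.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kdbus.h>

/*
 * Sketch: 'conn_fd' is the endpoint file descriptor on which
 * KDBUS_CMD_HELLO was issued, 'pool' the mapping of its entire pool:
 *
 *     uint8_t *pool = mmap(NULL, pool_size, PROT_READ, MAP_SHARED,
 *                          conn_fd, 0);
 */
static void receive_one(int conn_fd, const uint8_t *pool)
{
        struct kdbus_cmd_recv recv_cmd;
        struct kdbus_cmd_free free_cmd;
        const struct kdbus_msg *msg;

        memset(&recv_cmd, 0, sizeof(recv_cmd));
        recv_cmd.size = sizeof(recv_cmd);
        ioctl(conn_fd, KDBUS_CMD_RECV, &recv_cmd);

        /* The message lives inside the pool at the returned offset. */
        msg = (const struct kdbus_msg *)(pool + recv_cmd.msg.offset);
        /* ... walk msg->items here ... */

        /* Return the slice to the pool once done with it. */
        memset(&free_cmd, 0, sizeof(free_cmd));
        free_cmd.size = sizeof(free_cmd);
        free_cmd.offset = recv_cmd.msg.offset;
        ioctl(conn_fd, KDBUS_CMD_FREE, &free_cmd);
}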
+ + + + + Freeing pool slices + + The KDBUS_CMD_FREE ioctl is used to free a slice + inside the pool, describing an offset that was returned in an + offset field of another ioctl struct. + The KDBUS_CMD_FREE command takes a + struct kdbus_cmd_free as argument. + + + +struct kdbus_cmd_free { + __u64 size; + __u64 flags; + __u64 return_flags; + __u64 offset; + struct kdbus_item items[0]; +}; + + + The fields in this struct are described below. + + + + size + + The overall size of the struct, including its items. + + + + + flags + + Currently unused. + KDBUS_FLAG_NEGOTIATE is accepted to probe for + valid flags. If set, the ioctl will return 0, + and the flags field is set to + 0. + + + + + return_flags + + Flags returned by the kernel. Currently unused and always set to + 0 by the kernel. + + + + + offset + + The offset to free, as returned by other ioctls that allocated + memory for returned information. + + + + + items + + Items to specify further details for the receive command. + Currently unused. + Unrecognized items are rejected, and the ioctl will fail with + errno set to EINVAL. + All items except for + KDBUS_ITEM_NEGOTIATE (see + + kdbus.item + 7 + + ) will be rejected. + + + + + + + Return value + + On success, all mentioned ioctl commands return 0; + on error, -1 is returned, and + errno is set to indicate the error. + If the issued ioctl is illegal for the file descriptor used, + errno will be set to ENOTTY. + + + + + <constant>KDBUS_CMD_FREE</constant> may fail with the following + errors + + + + + ENXIO + + No pool slice found at given offset. + + + + + EINVAL + + Invalid flags provided. + + + + + EINVAL + + The offset is valid, but the user is not allowed to free the slice. + This happens, for example, if the offset was retrieved with + KDBUS_RECV_PEEK. + + + + + + + + See Also + + + + kdbus + 7 + + + + + kdbus.bus + 7 + + + + + kdbus.connection + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.name + 7 + + + + + mmap + 2 + + + + + munmap + 2 + + + + + diff -Nur linux-4.2/Documentation/kdbus/kdbus.xml linux-4.2-pck/Documentation/kdbus/kdbus.xml --- linux-4.2/Documentation/kdbus/kdbus.xml 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/kdbus.xml 2015-09-08 00:48:22.211698004 -0300 @@ -0,0 +1,1012 @@ + + + + + + + kdbus + kdbus + + + + kdbus + 7 + + + + kdbus + Kernel Message Bus + + + + Synopsis + + kdbus is an inter-process communication bus system controlled by the + kernel. It provides user-space with an API to create buses and send + unicast and multicast messages to one, or many, peers connected to the + same bus. It does not enforce any layout on the transmitted data, but + only provides the transport layer used for message interchange between + peers. + + + This set of man-pages gives a comprehensive overview of the kernel-level + API, with all ioctl commands, associated structs and bit masks. However, + most people will not use this API level directly, but rather let one of + the high-level abstraction libraries help them integrate D-Bus + functionality into their applications. + + + + + Description + + kdbus provides a pseudo filesystem called kdbusfs, + which is usually mounted on /sys/fs/kdbus. Bus + primitives can be accessed as files and sub-directories underneath this + mount-point. Any advanced operations are done via + ioctl() on files created by + kdbusfs. Multiple mount-points of + kdbusfs are independent of each other. This allows + namespacing of kdbus by mounting a new instance of + kdbusfs in a new mount-namespace. 
kdbus calls these + mount instances domains and each bus belongs to exactly one domain. + + + + kdbus was designed as a transport layer for D-Bus, but is in no way + limited, nor controlled by the D-Bus protocol specification. The D-Bus + protocol is one possible application layer on top of kdbus. + + + + For the general D-Bus protocol specification, its payload format, its + marshaling, and its communication semantics, please refer to the + + D-Bus specification. + + + + + + Terminology + + + Domain + + A domain is a kdbusfs mount-point containing all + the bus primitives. Each domain is independent, and separate domains + do not affect each other. + + + + + Bus + + A bus is a named object inside a domain. Clients exchange messages + over a bus. Multiple buses themselves have no connection to each other; + messages can only be exchanged on the same bus. The default endpoint of + a bus, to which clients establish connections, is the "bus" file + /sys/fs/kdbus/<bus name>/bus. + Common operating system setups create one "system bus" per system, + and one "user bus" for every logged-in user. Applications or services + may create their own private buses. The kernel driver does not + distinguish between different bus types, they are all handled the same + way. See + + kdbus.bus + 7 + + for more details. + + + + + Endpoint + + An endpoint provides a file to talk to a bus. Opening an endpoint + creates a new connection to the bus to which the endpoint belongs. All + endpoints have unique names and are accessible as files underneath the + directory of a bus, e.g., /sys/fs/kdbus/<bus>/<endpoint> + Every bus has a default endpoint called "bus". + A bus can optionally offer additional endpoints with custom names + to provide restricted access to the bus. Custom endpoints carry + additional policy which can be used to create sandboxes with + locked-down, limited, filtered access to a bus. See + + kdbus.endpoint + 7 + + for more details. + + + + + Connection + + A connection to a bus is created by opening an endpoint file of a + bus. Every ordinary client connection has a unique identifier on the + bus and can address messages to every other connection on the same + bus by using the peer's connection ID as the destination. See + + kdbus.connection + 7 + + for more details. + + + + + Pool + + Each connection allocates a piece of shmem-backed memory that is + used to receive messages and answers to ioctl commands from the kernel. + It is never used to send anything to the kernel. In order to access that + memory, an application must mmap() it into its address space. See + + kdbus.pool + 7 + + for more details. + + + + + Well-known Name + + A connection can, in addition to its implicit unique connection ID, + request the ownership of a textual well-known name. Well-known names are + noted in reverse-domain notation, such as com.example.service1. A + connection that offers a service on a bus is usually reached by its + well-known name. An analogy of connection ID and well-known name is an + IP address and a DNS name associated with that address. See + + kdbus.name + 7 + + for more details. + + + + + Message + + Connections can exchange messages with other connections by addressing + the peers with their connection ID or well-known name. A message + consists of a message header with information on how to route the + message, and the message payload, which is a logical byte stream of + arbitrary size. 
Messages can carry additional file descriptors to be + passed from one connection to another, just like passing file + descriptors over UNIX domain sockets. Every connection can specify which + set of metadata the kernel should attach to the message when it is + delivered to the receiving connection. Metadata contains information + like: system time stamps, UID, GID, TID, proc-starttime, well-known + names, process comm, process exe, process argv, cgroup, capabilities, + seclabel, audit session, loginuid and the connection's human-readable + name. See + + kdbus.message + 7 + + for more details. + + + + + Item + + The API of kdbus implements the notion of items, submitted through and + returned by most ioctls, and stored inside data structures in the + connection's pool. See + + kdbus.item + 7 + + for more details. + + + + + Broadcast, signal, filter, match + + Signals are messages that a receiver opts in for by installing a blob of + bytes, called a 'match'. Signal messages must always carry a + counter-part blob, called a 'filter', and signals are only delivered to + peers which have a match that white-lists the message's filter. Senders + of signal messages can use either a single connection ID as receiver, + or the special connection ID + KDBUS_DST_ID_BROADCAST to potentially send it to + all connections of a bus, following the logic described above. See + + kdbus.match + 7 + + and + + kdbus.message + 7 + + for more details. + + + + + Policy + + A policy is a set of rules that define which connections can see, talk + to, or register a well-known name on the bus. A policy is attached to + buses and custom endpoints, and modified by policy holder connections or + owners of custom endpoints. See + + kdbus.policy + 7 + + for more details. + + + + + Privileged bus users + + A user connecting to the bus is considered privileged if it is either + the creator of the bus, or if it has the CAP_IPC_OWNER capability flag + set. See + + kdbus.connection + 7 + + for more details. + + + + + + Bus Layout + + + A bus provides and defines an environment that peers + can connect to for message interchange. A bus is created via the kdbus + control interface and can be modified by the bus creator. It applies the + policy that control all bus operations. The bus creator itself does not + participate as a peer. To establish a peer + connection, you have to open one of the + endpoints of a bus. Each bus provides a default + endpoint, but further endpoints can be created on-demand. Endpoints are + used to apply additional policies for all connections on this endpoint. + Thus, they provide additional filters to further restrict access of + specific connections to the bus. + + + + Following, you can see an example bus layout: + + + + + + + + Data structures and interconnections + + + + + Metadata + + + When metadata is collected + + kdbus records data about the system in certain situations. Such metadata + can refer to the currently active process (creds, PIDs, current user + groups, process names and its executable path, cgroup membership, + capabilities, security label and audit information), connection + information (description string, currently owned names) and time stamps. + + + Metadata is collected at the following times. + + + + + When a bus is created (KDBUS_CMD_MAKE), + information about the calling task is collected. This data is returned + by the kernel via the KDBUS_CMD_BUS_CREATOR_INFO + call. 
+ + + + + When a connection is created (KDBUS_CMD_HELLO), + information about the calling task is collected. Alternatively, a + privileged connection may provide 'faked' information about + credentials, PIDs and security labels which will be stored instead. + This data is returned by the kernel as information on a connection + (KDBUS_CMD_CONN_INFO). Only metadata that a + connection allowed to be sent (by setting its bit in + attach_flags_send) will be exported in this way. + + + + + + When a message is sent (KDBUS_CMD_SEND), + information about the sending task and the sending connection is + collected. This metadata will be attached to the message when it + arrives in the receiver's pool. If the connection sending the + message installed faked credentials (see + + kdbus.connection + 7 + ), + the message will not be augmented by any information about the + currently sending task. Note that only metadata that was requested + by the receiving connection will be collected and attached to + messages. + + + + + + Which metadata items are actually delivered depends on the following + sets and masks: + + + + + (a) the system-wide kmod creds mask + (module parameter attach_flags_mask) + + + + (b) the per-connection send creds mask, set by the connecting client + + + + (c) the per-connection receive creds mask, set by the connecting + client + + + + (d) the per-bus minimal creds mask, set by the bus creator + + + + (e) the per-bus owner creds mask, set by the bus creator + + + + (f) the mask specified when querying creds of a bus peer + + + + (g) the mask specified when querying creds of a bus owner + + + + + With the following rules: + + + + + + [1] The creds attached to messages are determined as + a & b & c. + + + + + + [2] When connecting to a bus (KDBUS_CMD_HELLO), + and ~b & d != 0, the call will fail with, + -1, and errno is set to + ECONNREFUSED. + + + + + + [3] When querying creds of a bus peer, the creds returned are + a & b & f. + + + + + + [4] When querying creds of a bus owner, the creds returned are + a & e & g. + + + + + + Hence, programs might not always get all requested metadata items that + it requested. Code must be written so that it can cope with this fact. + + + + + Benefits and heads-up + + Attaching metadata to messages has two major benefits. + + + + + Metadata attached to messages is gathered at the moment when the + other side calls KDBUS_CMD_SEND, or, + respectively, then the kernel notification is generated. There is + no need for the receiving peer to retrieve information about the + task in a second step. This closes a race gap that would otherwise + be inherent. + + + + + As metadata is delivered along with messages in the same data + blob, no extra calls to kernel functions etc. are needed to gather + them. + + + + + Note, however, that collecting metadata does come at a price for + performance, so developers should carefully assess which metadata to + really opt-in for. For best practice, data that is not needed as part + of a message should not be requested by the connection in the first + place (see attach_flags_recv in + KDBUS_CMD_HELLO). + + + + + Attach flags for metadata items + + To let the kernel know which metadata information to attach as items + to the aforementioned commands, it uses a bitmask. In those, the + following attach flags are currently supported. 
+ Both the attach_flags_recv and + attach_flags_send fields of + struct kdbus_cmd_hello, as well as the payload of the + KDBUS_ITEM_ATTACH_FLAGS_SEND and + KDBUS_ITEM_ATTACH_FLAGS_RECV items follow this + scheme. + + + + + KDBUS_ATTACH_TIMESTAMP + + Requests the attachment of an item of type + KDBUS_ITEM_TIMESTAMP. + + + + + KDBUS_ATTACH_CREDS + + Requests the attachment of an item of type + KDBUS_ITEM_CREDS. + + + + + KDBUS_ATTACH_PIDS + + Requests the attachment of an item of type + KDBUS_ITEM_PIDS. + + + + + KDBUS_ATTACH_AUXGROUPS + + Requests the attachment of an item of type + KDBUS_ITEM_AUXGROUPS. + + + + + KDBUS_ATTACH_NAMES + + Requests the attachment of an item of type + KDBUS_ITEM_OWNED_NAME. + + + + + KDBUS_ATTACH_TID_COMM + + Requests the attachment of an item of type + KDBUS_ITEM_TID_COMM. + + + + + KDBUS_ATTACH_PID_COMM + + Requests the attachment of an item of type + KDBUS_ITEM_PID_COMM. + + + + + KDBUS_ATTACH_EXE + + Requests the attachment of an item of type + KDBUS_ITEM_EXE. + + + + + KDBUS_ATTACH_CMDLINE + + Requests the attachment of an item of type + KDBUS_ITEM_CMDLINE. + + + + + KDBUS_ATTACH_CGROUP + + Requests the attachment of an item of type + KDBUS_ITEM_CGROUP. + + + + + KDBUS_ATTACH_CAPS + + Requests the attachment of an item of type + KDBUS_ITEM_CAPS. + + + + + KDBUS_ATTACH_SECLABEL + + Requests the attachment of an item of type + KDBUS_ITEM_SECLABEL. + + + + + KDBUS_ATTACH_AUDIT + + Requests the attachment of an item of type + KDBUS_ITEM_AUDIT. + + + + + KDBUS_ATTACH_CONN_DESCRIPTION + + Requests the attachment of an item of type + KDBUS_ITEM_CONN_DESCRIPTION. + + + + + + Please refer to + + kdbus.item + 7 + + for detailed information about the layout and payload of items and + what metadata should be used to. + + + + + + The ioctl interface + + + As stated in the 'synopsis' section above, application developers are + strongly encouraged to use kdbus through one of the high-level D-Bus + abstraction libraries, rather than using the low-level API directly. + + + + kdbus on the kernel level exposes its functions exclusively through + + ioctl + 2 + , + employed on file descriptors returned by + + open + 2 + + on pseudo files exposed by + + kdbus.fs + 7 + . + + + Following is a list of all the ioctls, along with the command structs + they must be used with. + + + + + + + ioctl signature + command + transported struct + + + + + 0x40189500 + KDBUS_CMD_BUS_MAKE + struct kdbus_cmd * + + 0x40189510 + KDBUS_CMD_ENDPOINT_MAKE + struct kdbus_cmd * + + 0xc0609580 + KDBUS_CMD_HELLO + struct kdbus_cmd_hello * + + 0x40189582 + KDBUS_CMD_BYEBYE + struct kdbus_cmd * + + 0x40389590 + KDBUS_CMD_SEND + struct kdbus_cmd_send * + + 0x80409591 + KDBUS_CMD_RECV + struct kdbus_cmd_recv * + + 0x40209583 + KDBUS_CMD_FREE + struct kdbus_cmd_free * + + 0x401895a0 + KDBUS_CMD_NAME_ACQUIRE + struct kdbus_cmd * + + 0x401895a1 + KDBUS_CMD_NAME_RELEASE + struct kdbus_cmd * + + 0x80289586 + KDBUS_CMD_LIST + struct kdbus_cmd_list * + + 0x80309584 + KDBUS_CMD_CONN_INFO + struct kdbus_cmd_info * + + 0x40209551 + KDBUS_CMD_UPDATE + struct kdbus_cmd * + + 0x80309585 + KDBUS_CMD_BUS_CREATOR_INFO + struct kdbus_cmd_info * + + 0x40189511 + KDBUS_CMD_ENDPOINT_UPDATE + struct kdbus_cmd * + + 0x402095b0 + KDBUS_CMD_MATCH_ADD + struct kdbus_cmd_match * + + 0x402095b1 + KDBUS_CMD_MATCH_REMOVE + struct kdbus_cmd_match * + + + + + + + Depending on the type of kdbusfs node that was + opened and what ioctls have been executed on a file descriptor before, + a different sub-set of ioctl commands is allowed. 
+ + + + + + On a file descriptor resulting from opening a + control node, only the + KDBUS_CMD_BUS_MAKE ioctl may be executed. + + + + + On a file descriptor resulting from opening a + bus endpoint node, only the + KDBUS_CMD_ENDPOINT_MAKE and + KDBUS_CMD_HELLO ioctls may be executed. + + + + + A file descriptor that was used to create a bus + (via KDBUS_CMD_BUS_MAKE) is called a + bus owner file descriptor. The bus will be + active as long as the file descriptor is kept open. + A bus owner file descriptor can not be used to + employ any further ioctls. As soon as + + close + 2 + + is called on it, the bus will be shut down, along will all associated + endpoints and connections. See + + kdbus.bus + 7 + + for more details. + + + + + A file descriptor that was used to create an endpoint + (via KDBUS_CMD_ENDPOINT_MAKE) is called an + endpoint owner file descriptor. The endpoint + will be active as long as the file descriptor is kept open. + An endpoint owner file descriptor can only be used + to update details of an endpoint through the + KDBUS_CMD_ENDPOINT_UPDATE ioctl. As soon as + + close + 2 + + is called on it, the endpoint will be removed from the bus, and all + connections that are connected to the bus through it are shut down. + See + + kdbus.endpoint + 7 + + for more details. + + + + + A file descriptor that was used to create a connection + (via KDBUS_CMD_HELLO) is called a + connection owner file descriptor. The connection + will be active as long as the file descriptor is kept open. + A connection owner file descriptor may be used to + issue any of the following ioctls. + + + + + KDBUS_CMD_UPDATE to tweak details of the + connection. See + + kdbus.connection + 7 + . + + + + KDBUS_CMD_BYEBYE to shut down a connection + without losing messages. See + + kdbus.connection + 7 + . + + + + KDBUS_CMD_FREE to free a slice of memory in + the pool. See + + kdbus.pool + 7 + . + + + + KDBUS_CMD_CONN_INFO to retrieve information + on other connections on the bus. See + + kdbus.connection + 7 + . + + + + KDBUS_CMD_BUS_CREATOR_INFO to retrieve + information on the bus creator. See + + kdbus.connection + 7 + . + + + + KDBUS_CMD_LIST to retrieve a list of + currently active well-known names and unique IDs on the bus. See + + kdbus.name + 7 + . + + + + KDBUS_CMD_SEND and + KDBUS_CMD_RECV to send or receive a message. + See + + kdbus.message + 7 + . + + + + KDBUS_CMD_NAME_ACQUIRE and + KDBUS_CMD_NAME_RELEASE to acquire or release + a well-known name on the bus. See + + kdbus.name + 7 + . + + + + KDBUS_CMD_MATCH_ADD and + KDBUS_CMD_MATCH_REMOVE to add or remove + a match for signal messages. See + + kdbus.match + 7 + . + + + + + + + These ioctls, along with the structs they transport, are explained in + detail in the other documents linked to in the "See Also" section below. 
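To make the file descriptor rules above more concrete, here is a minimal, hedged sketch that creates a connection owner file descriptor: it opens the default endpoint of a hypothetical bus and issues KDBUS_CMD_HELLO on it. The bus path, the pool size and the zeroed flags are example values only, and the layout of struct kdbus_cmd_hello is assumed to come from the kdbus UAPI header (<linux/kdbus.h>); see kdbus.connection(7) for the authoritative description of the command.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/kdbus.h>   /* assumed location of the kdbus UAPI header */

    int main(void)
    {
            struct kdbus_cmd_hello hello;
            int fd;

            /* "1000-example" is a hypothetical bus name; the default endpoint
               of a bus is the "bus" file underneath the bus directory. */
            fd = open("/sys/fs/kdbus/1000-example/bus", O_RDWR | O_CLOEXEC);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            /* Only KDBUS_CMD_ENDPOINT_MAKE and KDBUS_CMD_HELLO are valid on an
               endpoint file descriptor; a successful HELLO turns it into a
               connection owner file descriptor. */
            memset(&hello, 0, sizeof(hello));
            hello.size = sizeof(hello);
            hello.pool_size = 1024 * 1024;   /* example receive pool size */

            if (ioctl(fd, KDBUS_CMD_HELLO, &hello) < 0) {
                    perror("KDBUS_CMD_HELLO");
                    close(fd);
                    return 1;
            }

            /* The connection may now use KDBUS_CMD_SEND, KDBUS_CMD_RECV,
               KDBUS_CMD_FREE and the other ioctls listed above; closing the
               file descriptor tears the connection down again. */
            close(fd);
            return 0;
    }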
+ + + + + See Also + + + + kdbus.bus + 7 + + + + + kdbus.connection + 7 + + + + + kdbus.endpoint + 7 + + + + + kdbus.fs + 7 + + + + + kdbus.item + 7 + + + + + kdbus.message + 7 + + + + + kdbus.name + 7 + + + + + kdbus.pool + 7 + + + + + ioctl + 2 + + + + + mmap + 2 + + + + + open + 2 + + + + + close + 2 + + + + D-Bus + + + + + diff -Nur linux-4.2/Documentation/kdbus/stylesheet.xsl linux-4.2-pck/Documentation/kdbus/stylesheet.xsl --- linux-4.2/Documentation/kdbus/stylesheet.xsl 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/kdbus/stylesheet.xsl 2015-09-08 00:48:22.211698004 -0300 @@ -0,0 +1,16 @@ + + + 1 + ansi + 80 + 0 + A4 + 2 + 1 + 1 + + + diff -Nur linux-4.2/Documentation/kernel-parameters.txt linux-4.2-pck/Documentation/kernel-parameters.txt --- linux-4.2/Documentation/kernel-parameters.txt 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/Documentation/kernel-parameters.txt 2015-09-08 11:20:31.767046152 -0300 @@ -3896,6 +3896,9 @@ HIGHMEM regardless of setting of CONFIG_HIGHPTE. + uuid_debug= (Boolean) whether to enable debugging of TuxOnIce's + uuid support. + vdso= [X86,SH] On X86_32, this is an alias for vdso32=. Otherwise: diff -Nur linux-4.2/Documentation/power/tuxonice-internals.txt linux-4.2-pck/Documentation/power/tuxonice-internals.txt --- linux-4.2/Documentation/power/tuxonice-internals.txt 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/power/tuxonice-internals.txt 2015-09-08 11:20:31.793711515 -0300 @@ -0,0 +1,532 @@ + TuxOnIce 4.0 Internal Documentation. + Updated to 23 March 2015 + +(Please note that incremental image support mentioned in this document is work +in progress. This document may need updating prior to the actual release of +4.0!) + +1. Introduction. + + TuxOnIce 4.0 is an addition to the Linux Kernel, designed to + allow the user to quickly shutdown and quickly boot a computer, without + needing to close documents or programs. It is equivalent to the + hibernate facility in some laptops. This implementation, however, + requires no special BIOS or hardware support. + + The code in these files is based upon the original implementation + prepared by Gabor Kuti and additional work by Pavel Machek and a + host of others. This code has been substantially reworked by Nigel + Cunningham, again with the help and testing of many others, not the + least of whom are Bernard Blackham and Michael Frank. At its heart, + however, the operation is essentially the same as Gabor's version. + +2. Overview of operation. + + The basic sequence of operations is as follows: + + a. Quiesce all other activity. + b. Ensure enough memory and storage space are available, and attempt + to free memory/storage if necessary. + c. Allocate the required memory and storage space. + d. Write the image. + e. Power down. + + There are a number of complicating factors which mean that things are + not as simple as the above would imply, however... + + o The activity of each process must be stopped at a point where it will + not be holding locks necessary for saving the image, or unexpectedly + restart operations due to something like a timeout and thereby make + our image inconsistent. + + o It is desirous that we sync outstanding I/O to disk before calculating + image statistics. This reduces corruption if one should suspend but + then not resume, and also makes later parts of the operation safer (see + below). + + o We need to get as close as we can to an atomic copy of the data. 
+ Inconsistencies in the image will result in inconsistent memory contents at + resume time, and thus in instability of the system and/or file system + corruption. This would appear to imply a maximum image size of one half of + the amount of RAM, but we have a solution... (again, below). + + o In 2.6 and later, we choose to play nicely with the other suspend-to-disk + implementations. + +3. Detailed description of internals. + + a. Quiescing activity. + + Safely quiescing the system is achieved using three separate but related + aspects. + + First, we use the vanilla kernel's support for freezing processes. This code + is based on the observation that the vast majority of processes don't need + to run during suspend. They can be 'frozen'. The kernel therefore + implements a refrigerator routine, which processes enter and in which they + remain until the cycle is complete. Processes enter the refrigerator via + try_to_freeze() invocations at appropriate places. A process cannot be + frozen in any old place. It must not be holding locks that will be needed + for writing the image or freezing other processes. For this reason, + userspace processes generally enter the refrigerator via the signal + handling code, and kernel threads at the place in their event loops where + they drop locks and yield to other processes or sleep. The task of freezing + processes is complicated by the fact that there can be interdependencies + between processes. Freezing process A before process B may mean that + process B cannot be frozen, because it gets stuck waiting for process A + rather than in the refrigerator. This issue is seen where userspace waits + on freezeable kernel threads or fuse filesystem threads. To address this + issue, we implement the following algorithm for quiescing activity: + + - Freeze filesystems (including fuse - userspace programs starting + new requests are immediately frozen; programs already running + requests complete their work before being frozen in the next + step) + - Freeze userspace + - Thaw filesystems (this is safe now that userspace is frozen and no + fuse requests are outstanding). + - Invoke sys_sync (noop on fuse). + - Freeze filesystems + - Freeze kernel threads + + If we need to free memory, we thaw kernel threads and filesystems, but not + userspace. We can then free caches without worrying about deadlocks due to + swap files being on frozen filesystems or such like. + + b. Ensure enough memory & storage are available. + + We have a number of constraints to meet in order to be able to successfully + suspend and resume. + + First, the image will be written in two parts, described below. One of + these parts needs to have an atomic copy made, which of course implies a + maximum size of one half of the amount of system memory. The other part + ('pageset') is not atomically copied, and can therefore be as large or + small as desired. + + Second, we have constraints on the amount of storage available. In these + calculations, we may also consider any compression that will be done. The + cryptoapi module allows the user to configure an expected compression ratio. + + Third, the user can specify an arbitrary limit on the image size, in + megabytes. This limit is treated as a soft limit, so that we don't fail the + attempt to suspend if we cannot meet this constraint. + + c. Allocate the required memory and storage space. + + Having done the initial freeze, we determine whether the above constraints + are met, and seek to allocate the metadata for the image. If the constraints + are not met, or we fail to allocate the required space for the metadata, we + seek to free the amount of memory that we calculate is needed and try again. + We allow up to four iterations of this loop before aborting the cycle. If + we do fail, it should only be because of a bug in TuxOnIce's calculations + or the vanilla kernel code for freeing memory. + + These steps are merged together in the prepare_image function, found in + prepare_image.c. The functions are merged because of the cyclical nature + of the problem of calculating how much memory and storage is needed. Since + the data structures containing the information about the image must + themselves take memory and use storage, the amount of memory and storage + required changes as we prepare the image. Since the changes are not large, + only one or two iterations will be required to achieve a solution. + + The recursive nature of the algorithm is minimised by keeping user space + frozen while preparing the image, and by the fact that our records of which + pages are to be saved and which pageset they are saved in use bitmaps (so + that changes in number or fragmentation of the pages to be saved don't + feed back via changes in the amount of memory needed for metadata). The + recursiveness is thus limited to any extra slab pages allocated to store the + extents that record storage used, and the effects of seeking to free memory. + + d. Write the image. + + We previously mentioned the need to create an atomic copy of the data, and + the half-of-memory limitation that is implied in this. This limitation is + circumvented by dividing the memory to be saved into two parts, called + pagesets. + + Pageset2 contains most of the page cache - the pages on the active and + inactive LRU lists that aren't needed or modified while TuxOnIce is + running, so they can be safely written without an atomic copy. They are + therefore saved first and reloaded last. While saving these pages, + TuxOnIce carefully ensures that the work of writing the pages doesn't make + the image inconsistent. With the support for Kernel (Video) Mode Setting + going into the kernel at the time of writing, we need to check for pages + on the LRU that are used by KMS, and exclude them from pageset2. They are + atomically copied as part of pageset 1. + + Once pageset2 has been saved, we prepare to do the atomic copy of remaining + memory. As part of the preparation, we power down drivers, thereby providing + them with the opportunity to have their state recorded in the image. The + amount of memory allocated by drivers for this is usually negligible, but if + DRI is in use, video drivers may require significant amounts. Ideally we + would be able to query drivers while preparing the image as to the amount of + memory they will need. Unfortunately no such mechanism exists at the time of + writing. For this reason, TuxOnIce allows the user to set an + 'extra_pages_allowance', which is used to seek to ensure sufficient memory + is available for drivers at this point. TuxOnIce also lets the user set this + value to 0. In this case, a test driver suspend is done while preparing the + image, and the difference (plus a margin) used instead. TuxOnIce will also + automatically restart the hibernation process (twice at most) if it finds + that the extra pages allowance is not sufficient. It will then use what was + actually needed (plus a margin, again). Failure to hibernate should thus + be an extremely rare occurrence. + + Having suspended the drivers, we save the CPU context before making an + atomic copy of pageset1, resuming the drivers and saving the atomic copy. + After saving the two pagesets, we just need to save our metadata before + powering down. + + As we mentioned earlier, the contents of pageset2 pages aren't needed once + they've been saved. We therefore use them as the destination of our atomic + copy. In the unlikely event that pageset1 is larger, extra pages are + allocated while the image is being prepared. This is normally only a real + possibility when the system has just been booted and the page cache is + small. + + This is where we need to be careful about syncing, however. Pageset2 will + probably contain filesystem metadata. If this is overwritten with pageset1 + and then a sync occurs, the filesystem will be corrupted - at least until + resume time and another sync of the restored data. Since there is a + possibility that the user might not resume or (may it never be!) that + TuxOnIce might oops, we do our utmost to avoid syncing filesystems after + copying pageset1. + + e. Incremental images + + TuxOnIce 4.0 introduces a new incremental image mode which changes things a + little. When incremental images are enabled, we save a 'normal' image the + first time we hibernate. On resume, however, we do not free the image or + the associated storage. Instead, it is retained until the next attempt at + hibernating and a mechanism is enabled which is used to track which pages + of memory are modified between the two cycles. The modified pages can then + be added to the existing image, rather than unmodified pages being saved + again unnecessarily. + + Incremental image support is available in 64 bit Linux only, due to the + requirement for extra page flags. + + This support is accomplished in the following way: + + 1) Tracking of pages. + + The tracking of changed pages is accomplished using the page fault + mechanism. When we reach a point at which we want to start tracking + changes, most pages are marked read-only and also flagged as being + read-only because of this support. Since this cannot happen for every page + of RAM, some are marked as untracked and always treated as modified when + preparing an incremental image. When a process attempts to modify a page + that is marked read-only in this way, a page fault occurs, with TuxOnIce + code marking the page writable and dirty before allowing the write to + continue. In this way, the effect of incremental images on performance is + minimised - a page only causes a fault once. Small modifications to the + page allocator further reduce the number of faults that occur - free pages + are not tracked; they are made writable and marked as dirty as part of + being allocated. + + 2) Saving the incremental image / atomicity. + + The page fault mechanism is also used to improve the means by which + atomicity of the image is achieved. When it is time to do an atomic copy, + the flags for pages are reset, with the result being that it is no longer + necessary for us to do an atomic copy of pageset1. Instead, we normally write + the uncopied pages to disk. When an attempt is made to modify a page that + has not yet been saved, the page-fault mechanism makes a copy of the page + prior to allowing the write. This copy is then written to disk. Likewise, + on resume, if a process attempts to write to a page that has been read + while the rest of the image is still being loaded, a copy of that page is + made prior to the write being allowed.
At the end of loading the image, + modified pages can thus be restored to their 'atomic copy' contents prior + to restarting normal operation. We also mark pages that are yet to be read + as invalid PFNs, so that we can capture as a bug any attempt by a + half-restored kernel to access a page that hasn't yet been reloaded. + + f. Power down. + + Powering down uses standard kernel routines. TuxOnIce supports powering down + using the ACPI S3, S4 and S5 methods or the kernel's non-ACPI power-off. + Supporting suspend to ram (S3) as a power off option might sound strange, + but it allows the user to quickly get their system up and running again if + the battery doesn't run out (we just need to re-read the overwritten pages) + and if the battery does run out (or the user removes power), they can still + resume. + +4. Data Structures. + + TuxOnIce uses three main structures to store its metadata and configuration + information: + + a) Pageflags bitmaps. + + TuxOnIce records which pages will be in pageset1, pageset2, the destination + of the atomic copy and the source of the atomically restored image using + bitmaps. The code used is that written for swsusp, with small improvements + to match TuxOnIce's requirements. + + The pageset1 bitmap is thus easily stored in the image header for use at + resume time. + + As mentioned above, using bitmaps also means that the amount of memory and + storage required for recording the above information is constant. This + greatly simplifies the work of preparing the image. In earlier versions of + TuxOnIce, extents were used to record which pages would be stored. In that + case, however, eating memory could result in greater fragmentation of the + lists of pages, which in turn required more memory to store the extents and + more storage in the image header. These could in turn require further + freeing of memory, and another iteration. All of this complexity is removed + by having bitmaps. + + Bitmaps also make a lot of sense because TuxOnIce only ever iterates + through the lists. There is therefore no cost to not being able to find the + nth page in order 0 time. We only need to worry about the cost of finding + the n+1th page, given the location of the nth page. Bitwise optimisations + help here. + + b) Extents for block data. + + TuxOnIce supports writing the image to multiple block devices. In the case + of swap, multiple partitions and/or files may be in use, and we happily use + them all (with the exception of compcache pages, which we allocate but do + not use). This use of multiple block devices is accomplished as follows: + + Whatever the actual source of the allocated storage, the destination of the + image can be viewed in terms of one or more block devices, and on each + device, a list of sectors. To simplify matters, we only use contiguous, + PAGE_SIZE aligned sectors, like the swap code does. + + Since sector numbers on each bdev may well not start at 0, it makes much + more sense to use extents here. Contiguous ranges of pages can thus be + represented in the extents by contiguous values. + + Variations in block size are taken account of in transforming this data + into the parameters for bio submission. + + We can thus implement a layer of abstraction wherein the core of TuxOnIce + doesn't have to worry about which device we're currently writing to or + where in the device we are. It simply requests that the next page in the + pageset or header be written, leaving the details to this lower layer. 
+ The lower layer remembers where in the sequence of devices and blocks each + pageset starts. The header always starts at the beginning of the allocated + storage. + + So extents are: + + struct extent { + unsigned long minimum, maximum; + struct extent *next; + }; + + These are combined into chains of extents for a device: + + struct extent_chain { + int size; /* size of the extent ie sum (max-min+1) */ + int allocs, frees; + char *name; + struct extent *first, *last_touched; + }; + + For each bdev, we need to store a little more info (simplified definition): + + struct toi_bdev_info { + struct block_device *bdev; + + char uuid[17]; + dev_t dev_t; + int bmap_shift; + int blocks_per_page; + }; + + The uuid is the main means used to identify the device in the storage + image. This means we can cope with the dev_t representation of a device + changing between saving the image and restoring it, as may happen on some + BIOSes or in the LVM case. + + bmap_shift and blocks_per_page apply the effects of variations in blocks + per page settings for the filesystem and underlying bdev. For most + filesystems, these are the same, but for xfs, they can have independent + values. + + Combining these two structures together, we have everything we need to + record what devices and what blocks on each device are being used to + store the image, and to submit i/o using bio_submit. + + The last elements in the picture are a means of recording how the storage + is being used. + + We do this first and foremost by implementing a layer of abstraction on + top of the devices and extent chains which allows us to view however many + devices there might be as one long storage tape, with a single 'head' that + tracks a 'current position' on the tape: + + struct extent_iterate_state { + struct extent_chain *chains; + int num_chains; + int current_chain; + struct extent *current_extent; + unsigned long current_offset; + }; + + That is, *chains points to an array of size num_chains of extent chains. + For the filewriter, this is always a single chain. For the swapwriter, the + array is of size MAX_SWAPFILES. + + current_chain, current_extent and current_offset thus point to the current + index in the chains array (and into a matching array of struct + suspend_bdev_info), the current extent in that chain (to optimise access), + and the current value in the offset. + + The image is divided into three parts: + - The header + - Pageset 1 + - Pageset 2 + + The header always starts at the first device and first block. We know its + size before we begin to save the image because we carefully account for + everything that will be stored in it. + + The second pageset (LRU) is stored first. It begins on the next page after + the end of the header. + + The first pageset is stored second. Its start location is only known once + pageset2 has been saved, since pageset2 may be compressed as it is written. + This location is thus recorded at the end of saving pageset2. It is page + aligned also. + + Since this information is needed at resume time, and the location of extents + in memory will differ at resume time, this needs to be stored in a portable + way: + + struct extent_iterate_saved_state { + int chain_num; + int extent_num; + unsigned long offset; + }; + + We can thus implement a layer of abstraction wherein the core of TuxOnIce + doesn't have to worry about which device we're currently writing to or + where in the device we are.
It simply requests that the next page in the + pageset or header be written, leaving the details to this layer, and + invokes the routines to remember and restore the position, without having + to worry about the details of how the data is arranged on disk or such like. + + c) Modules + + One aim in designing TuxOnIce was to make it flexible. We wanted to allow + for the implementation of different methods of transforming a page to be + written to disk and different methods of getting the pages stored. + + In early versions (the betas and perhaps Suspend1), compression support was + inlined in the image writing code, and the data structures and code for + managing swap were intertwined with the rest of the code. A number of people + had expressed interest in implementing image encryption, and alternative + methods of storing the image. + + In order to achieve this, TuxOnIce was given a modular design. + + A module is a single file which encapsulates the functionality needed + to transform a pageset of data (encryption or compression, for example), + or to write the pageset to a device. The former type of module is called + a 'page-transformer', the later a 'writer'. + + Modules are linked together in pipeline fashion. There may be zero or more + page transformers in a pipeline, and there is always exactly one writer. + The pipeline follows this pattern: + + --------------------------------- + | TuxOnIce Core | + --------------------------------- + | + | + --------------------------------- + | Page transformer 1 | + --------------------------------- + | + | + --------------------------------- + | Page transformer 2 | + --------------------------------- + | + | + --------------------------------- + | Writer | + --------------------------------- + + During the writing of an image, the core code feeds pages one at a time + to the first module. This module performs whatever transformations it + implements on the incoming data, completely consuming the incoming data and + feeding output in a similar manner to the next module. + + All routines are SMP safe, and the final result of the transformations is + written with an index (provided by the core) and size of the output by the + writer. As a result, we can have multithreaded I/O without needing to + worry about the sequence in which pages are written (or read). + + During reading, the pipeline works in the reverse direction. The core code + calls the first module with the address of a buffer which should be filled. + (Note that the buffer size is always PAGE_SIZE at this time). This module + will in turn request data from the next module and so on down until the + writer is made to read from the stored image. + + Part of definition of the structure of a module thus looks like this: + + int (*rw_init) (int rw, int stream_number); + int (*rw_cleanup) (int rw); + int (*write_chunk) (struct page *buffer_page); + int (*read_chunk) (struct page *buffer_page, int sync); + + It should be noted that the _cleanup routine may be called before the + full stream of data has been read or written. While writing the image, + the user may (depending upon settings) choose to abort suspending, and + if we are in the midst of writing the last portion of the image, a portion + of the second pageset may be reread. This may also happen if an error + occurs and we seek to abort the process of writing the image. + + The modular design is also useful in a number of other ways. 
It provides + a means where by we can add support for: + + - providing overall initialisation and cleanup routines; + - serialising configuration information in the image header; + - providing debugging information to the user; + - determining memory and image storage requirements; + - dis/enabling components at run-time; + - configuring the module (see below); + + ...and routines for writers specific to their work: + - Parsing a resume= location; + - Determining whether an image exists; + - Marking a resume as having been attempted; + - Invalidating an image; + + Since some parts of the core - the user interface and storage manager + support - have use for some of these functions, they are registered as + 'miscellaneous' modules as well. + + d) Sysfs data structures. + + This brings us naturally to support for configuring TuxOnIce. We desired to + provide a way to make TuxOnIce as flexible and configurable as possible. + The user shouldn't have to reboot just because they want to now hibernate to + a file instead of a partition, for example. + + To accomplish this, TuxOnIce implements a very generic means whereby the + core and modules can register new sysfs entries. All TuxOnIce entries use + a single _store and _show routine, both of which are found in + tuxonice_sysfs.c in the kernel/power directory. These routines handle the + most common operations - getting and setting the values of bits, integers, + longs, unsigned longs and strings in one place, and allow overrides for + customised get and set options as well as side-effect routines for all + reads and writes. + + When combined with some simple macros, a new sysfs entry can then be defined + in just a couple of lines: + + SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, + 2048, 0, NULL), + + This defines a sysfs entry named "progress_granularity" which is rw and + allows the user to access an integer stored at &progress_granularity, giving + it a value between 1 and 2048 inclusive. + + Sysfs entries are registered under /sys/power/tuxonice, and entries for + modules are located in a subdirectory named after the module. + diff -Nur linux-4.2/Documentation/power/tuxonice.txt linux-4.2-pck/Documentation/power/tuxonice.txt --- linux-4.2/Documentation/power/tuxonice.txt 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/power/tuxonice.txt 2015-09-08 11:20:31.793711515 -0300 @@ -0,0 +1,948 @@ + --- TuxOnIce, version 3.0 --- + +1. What is it? +2. Why would you want it? +3. What do you need to use it? +4. Why not just use the version already in the kernel? +5. How do you use it? +6. What do all those entries in /sys/power/tuxonice do? +7. How do you get support? +8. I think I've found a bug. What should I do? +9. When will XXX be supported? +10 How does it work? +11. Who wrote TuxOnIce? + +1. What is it? + + Imagine you're sitting at your computer, working away. For some reason, you + need to turn off your computer for a while - perhaps it's time to go home + for the day. When you come back to your computer next, you're going to want + to carry on where you left off. Now imagine that you could push a button and + have your computer store the contents of its memory to disk and power down. + Then, when you next start up your computer, it loads that image back into + memory and you can carry on from where you were, just as if you'd never + turned the computer off. You have far less time to start up, no reopening of + applications or finding what directory you put that file in yesterday. 
+ That's what TuxOnIce does. + + TuxOnIce has a long heritage. It began life as work by Gabor Kuti, who, + with some help from Pavel Machek, got an early version going in 1999. The + project was then taken over by Florent Chabaud while still in alpha version + numbers. Nigel Cunningham came on the scene when Florent was unable to + continue, moving the project into betas, then 1.0, 2.0 and so on up to + the present series. During the 2.0 series, the name was contracted to + Suspend2 and the website suspend2.net created. Beginning around July 2007, + a transition to calling the software TuxOnIce was made, to seek to help + make it clear that TuxOnIce is more concerned with hibernation than suspend + to ram. + + Pavel Machek's swsusp code, which was merged around 2.5.17 retains the + original name, and was essentially a fork of the beta code until Rafael + Wysocki came on the scene in 2005 and began to improve it further. + +2. Why would you want it? + + Why wouldn't you want it? + + Being able to save the state of your system and quickly restore it improves + your productivity - you get a useful system in far less time than through + the normal boot process. You also get to be completely 'green', using zero + power, or as close to that as possible (the computer may still provide + minimal power to some devices, so they can initiate a power on, but that + will be the same amount of power as would be used if you told the computer + to shutdown. + +3. What do you need to use it? + + a. Kernel Support. + + i) The TuxOnIce patch. + + TuxOnIce is part of the Linux Kernel. This version is not part of Linus's + 2.6 tree at the moment, so you will need to download the kernel source and + apply the latest patch. Having done that, enable the appropriate options in + make [menu|x]config (under Power Management Options - look for "Enhanced + Hibernation"), compile and install your kernel. TuxOnIce works with SMP, + Highmem, preemption, fuse filesystems, x86-32, PPC and x86_64. + + TuxOnIce patches are available from http://tuxonice.net. + + ii) Compression support. + + Compression support is implemented via the cryptoapi. You will therefore want + to select any Cryptoapi transforms that you want to use on your image from + the Cryptoapi menu while configuring your kernel. We recommend the use of the + LZO compression method - it is very fast and still achieves good compression. + + You can also tell TuxOnIce to write its image to an encrypted and/or + compressed filesystem/swap partition. In that case, you don't need to do + anything special for TuxOnIce when it comes to kernel configuration. + + iii) Configuring other options. + + While you're configuring your kernel, try to configure as much as possible + to build as modules. We recommend this because there are a number of drivers + that are still in the process of implementing proper power management + support. In those cases, the best way to work around their current lack is + to build them as modules and remove the modules while hibernating. You might + also bug the driver authors to get their support up to speed, or even help! + + b. Storage. + + i) Swap. + + TuxOnIce can store the hibernation image in your swap partition, a swap file or + a combination thereof. Whichever combination you choose, you will probably + want to create enough swap space to store the largest image you could have, + plus the space you'd normally use for swap. 
A good rule of thumb would be + to calculate the amount of swap you'd want without using TuxOnIce, and then + add the amount of memory you have. This swapspace can be arranged in any way + you'd like. It can be in one partition or file, or spread over a number. The + only requirement is that they be active when you start a hibernation cycle. + + There is one exception to this requirement. TuxOnIce has the ability to turn + on one swap file or partition at the start of hibernating and turn it back off + at the end. If you want to ensure you have enough memory to store a image + when your memory is fully used, you might want to make one swap partition or + file for 'normal' use, and another for TuxOnIce to activate & deactivate + automatically. (Further details below). + + ii) Normal files. + + TuxOnIce includes a 'file allocator'. The file allocator can store your + image in a simple file. Since Linux has the concept of everything being a + file, this is more powerful than it initially sounds. If, for example, you + were to set up a network block device file, you could hibernate to a network + server. This has been tested and works to a point, but nbd itself isn't + stateless enough for our purposes. + + Take extra care when setting up the file allocator. If you just type + commands without thinking and then try to hibernate, you could cause + irreversible corruption on your filesystems! Make sure you have backups. + + Most people will only want to hibernate to a local file. To achieve that, do + something along the lines of: + + echo "TuxOnIce" > /hibernation-file + dd if=/dev/zero bs=1M count=512 >> /hibernation-file + + This will create a 512MB file called /hibernation-file. To get TuxOnIce to use + it: + + echo /hibernation-file > /sys/power/tuxonice/file/target + + Then + + cat /sys/power/tuxonice/resume + + Put the results of this into your bootloader's configuration (see also step + C, below): + + ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE--- + # cat /sys/power/tuxonice/resume + file:/dev/hda2:0x1e001 + + In this example, we would edit the append= line of our lilo.conf|menu.lst + so that it included: + + resume=file:/dev/hda2:0x1e001 + ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE--- + + For those who are thinking 'Could I make the file sparse?', the answer is + 'No!'. At the moment, there is no way for TuxOnIce to fill in the holes in + a sparse file while hibernating. In the longer term (post merge!), I'd like + to change things so that the file could be dynamically resized and have + holes filled as needed. Right now, however, that's not possible and not a + priority. + + c. Bootloader configuration. + + Using TuxOnIce also requires that you add an extra parameter to + your lilo.conf or equivalent. Here's an example for a swap partition: + + append="resume=swap:/dev/hda1" + + This would tell TuxOnIce that /dev/hda1 is a swap partition you + have. TuxOnIce will use the swap signature of this partition as a + pointer to your data when you hibernate. This means that (in this example) + /dev/hda1 doesn't need to be _the_ swap partition where all of your data + is actually stored. It just needs to be a swap partition that has a + valid signature. + + You don't need to have a swap partition for this purpose. TuxOnIce + can also use a swap file, but usage is a little more complex. Having made + your swap file, turn it on and do + + cat /sys/power/tuxonice/swap/headerlocations + + (this assumes you've already compiled your kernel with TuxOnIce + support and booted it). 
The results of the cat command will tell you + what you need to put in lilo.conf: + + For swap partitions like /dev/hda1, simply use resume=/dev/hda1. + For swapfile `swapfile`, use resume=swap:/dev/hda2:0x242d. + + If the swapfile changes for any reason (it is moved to a different + location, it is deleted and recreated, or the filesystem is + defragmented) then you will have to check + /sys/power/tuxonice/swap/headerlocations for a new resume_block value. + + Once you've compiled and installed the kernel and adjusted your bootloader + configuration, you should only need to reboot for the most basic part + of TuxOnIce to be ready. + + If you only compile in the swap allocator, or only compile in the file + allocator, you don't need to add the "swap:" part of the resume= + parameters above. resume=/dev/hda2:0x242d will work just as well. If you + have compiled both and your storage is on swap, you can also use this + format (the swap allocator is the default allocator). + + When compiling your kernel, one of the options in the 'Power Management + Support' menu, just above the 'Enhanced Hibernation (TuxOnIce)' entry is + called 'Default resume partition'. This can be used to set a default value + for the resume= parameter. + + d. The hibernate script. + + Since the driver model in 2.6 kernels is still being developed, you may need + to do more than just configure TuxOnIce. Users of TuxOnIce usually start the + process via a script which prepares for the hibernation cycle, tells the + kernel to do its stuff and then restore things afterwards. This script might + involve: + + - Switching to a text console and back if X doesn't like the video card + status on resume. + - Un/reloading drivers that don't play well with hibernation. + + Note that you might not be able to unload some drivers if there are + processes using them. You might have to kill off processes that hold + devices open. Hint: if your X server accesses an USB mouse, doing a + 'chvt' to a text console releases the device and you can unload the + module. + + Check out the latest script (available on tuxonice.net). + + e. The userspace user interface. + + TuxOnIce has very limited support for displaying status if you only apply + the kernel patch - it can printk messages, but that is all. In addition, + some of the functions mentioned in this document (such as cancelling a cycle + or performing interactive debugging) are unavailable. To utilise these + functions, or simply get a nice display, you need the 'userui' component. + Userui comes in three flavours, usplash, fbsplash and text. Text should + work on any console. Usplash and fbsplash require the appropriate + (distro specific?) support. + + To utilise a userui, TuxOnIce just needs to be told where to find the + userspace binary: + + echo "/usr/local/sbin/tuxoniceui_fbsplash" > /sys/power/tuxonice/user_interface/program + + The hibernate script can do this for you, and a default value for this + setting can be configured when compiling the kernel. This path is also + stored in the image header, so if you have an initrd or initramfs, you can + use the userui during the first part of resuming (prior to the atomic + restore) by putting the binary in the same path in your initrd/ramfs. + Alternatively, you can put it in a different location and do an echo + similar to the above prior to the echo > do_resume. The value saved in the + image header will then be ignored. + +4. Why not just use the version already in the kernel? 
+ + The version in the vanilla kernel has a number of drawbacks. The most + serious of these are: + - it has a maximum image size of 1/2 total memory; + - it doesn't allocate storage until after it has snapshotted memory. + This means that you can't be sure hibernating will work until you + see it start to write the image; + - it does not allow you to press escape to cancel a cycle; + - it does not allow you to press escape to cancel resuming; + - it does not allow you to automatically swapon a file when + starting a cycle; + - it does not allow you to use multiple swap partitions or files; + - it does not allow you to use ordinary files; + - it just invalidates an image and continues to boot if you + accidentally boot the wrong kernel after hibernating; + - it doesn't support any sort of nice display while hibernating; + - it is moving toward requiring that you have an initrd/initramfs + to ever have a hope of resuming (uswsusp). While uswsusp will + address some of the concerns above, it won't address all of them, + and will be more complicated to get set up; + - it doesn't have support for suspend-to-both (write a hibernation + image, then suspend to ram; I think this is known as ReadySafe + under M$). + +5. How do you use it? + + A hibernation cycle can be started directly by doing: + + echo > /sys/power/tuxonice/do_hibernate + + In practice, though, you'll probably want to use the hibernate script + to unload modules, configure the kernel the way you like it and so on. + In that case, you'd do (as root): + + hibernate + + See the hibernate script's man page for more details on the options it + takes. + + If you're using the text or splash user interface modules, one feature of + TuxOnIce that you might find useful is that you can press Escape at any time + during hibernating, and the process will be aborted. + + Due to the way hibernation works, this means you'll have your system back and + perfectly usable almost instantly. The only exception is when it's at the + very end of writing the image. Then it will need to reload a small (usually + 4-50MBs, depending upon the image characteristics) portion first. + + Likewise, when resuming, you can press escape and resuming will be aborted. + The computer will then powerdown again according to settings at that time for + the powerdown method or rebooting. + + You can change the settings for powering down while the image is being + written by pressing 'R' to toggle rebooting and 'O' to toggle between + suspending to ram and powering down completely). + + If you run into problems with resuming, adding the "noresume" option to + the kernel command line will let you skip the resume step and recover your + system. This option shouldn't normally be needed, because TuxOnIce modifies + the image header prior to the atomic restore, and will thus prompt you + if it detects that you've tried to resume an image before (this flag is + removed if you press Escape to cancel a resume, so you won't be prompted + then). + + Recent kernels (2.6.24 onwards) add support for resuming from a different + kernel to the one that was hibernated (thanks to Rafael for his work on + this - I've just embraced and enhanced the support for TuxOnIce). This + should further reduce the need for you to use the noresume option. + +6. What do all those entries in /sys/power/tuxonice do? + + /sys/power/tuxonice is the directory which contains files you can use to + tune and configure TuxOnIce to your liking. 
The exact contents of
+ the directory will depend upon the version of TuxOnIce you're
+ running and the options you selected at compile time. In the following
+ descriptions, names in brackets refer to compile time options.
+ (Note that they're all dependent upon you having selected CONFIG_TUXONICE
+ in the first place!).
+
+ Since the values of these settings can open potential security risks, the
+ writeable ones are accessible only to the root user. You may want to
+ configure sudo to allow you to invoke your hibernate script as an ordinary
+ user.
+
+ - alloc/failure_test
+
+ This debugging option provides a way of testing TuxOnIce's handling of
+ memory allocation failures. Each allocation type that TuxOnIce makes has
+ been given a unique number (see the source code). Echo the appropriate
+ number into this entry, and when TuxOnIce attempts to do that allocation,
+ it will pretend there was a failure and act accordingly.
+
+ - alloc/find_max_mem_allocated
+
+ This debugging option will cause TuxOnIce to find the maximum amount of
+ memory it used during a cycle, and report that information in debugging
+ information at the end of the cycle.
+
+ - alt_resume_param
+
+ Instead of powering down after writing a hibernation image, TuxOnIce
+ supports resuming from a different image. This entry lets you set the
+ location of the signature for that image (the resume= value you'd use
+ for it). Using an alternate image and keep_image mode, you can do things
+ like powering down an uninterruptible power supply.
+
+ - block_io/target_outstanding_io
+
+ This value controls the amount of memory that the block I/O code says it
+ needs when the core code is calculating how much memory is needed for
+ hibernating and for resuming. It doesn't directly control the amount of
+ I/O that is submitted at any one time - that depends on the amount of
+ available memory (we may have more available than we asked for), the
+ throughput that is being achieved and the ability of the CPU to keep up
+ with disk throughput (particularly where we're compressing pages).
+
+ - checksum/enabled
+
+ Use cryptoapi hashing routines to verify that Pageset2 pages don't change
+ while we're saving the first part of the image, and to get any pages that
+ do change resaved in the atomic copy. This should normally not be needed,
+ but if you're seeing issues, please enable this. If your issues stop you
+ from being able to resume, enable this option, hibernate and cancel the
+ cycle after the atomic copy is done. If the debugging info shows a non-zero
+ number of pages resaved, please report this to Nigel.
+
+ - compression/algorithm
+
+ Set the cryptoapi algorithm used for compressing the image.
+
+ - compression/expected_compression
+
+ This value allows you to set an expected compression ratio, which TuxOnIce
+ will use in calculating whether it meets constraints on the image size. If
+ this expected compression ratio is not attained, the hibernation cycle will
+ abort, so it is wise to allow some spare. You can see what compression
+ ratio is achieved in the logs after hibernating.
+
+ - debug_info:
+
+ This file returns information about your configuration that may be helpful
+ in diagnosing problems with hibernating.
+
+ - did_suspend_to_both:
+
+ This file can be used when you hibernate with powerdown method 3 (ie suspend
+ to ram after writing the image). There can be two outcomes in this case.
We
+ can resume from the suspend-to-ram before the battery runs out, or we can run
+ out of juice and end up resuming like normal. This entry lets you find out,
+ post resume, which way we went. If the value is 1, we resumed from suspend
+ to ram. This can be useful when actions need to be run post suspend-to-ram
+ that don't need to be run if we did the normal resume from power off.
+
+ - do_hibernate:
+
+ When anything is written to this file, the kernel side of TuxOnIce will
+ begin to attempt to write an image to disk and power down. You'll normally
+ want to run the hibernate script instead, to get modules unloaded first.
+
+ - do_resume:
+
+ When anything is written to this file TuxOnIce will attempt to read and
+ restore an image. If there is no image, it will return almost immediately.
+ If an image exists, the echo > will never return. Instead, the original
+ kernel context will be restored and the original echo > do_hibernate will
+ return.
+
+ - */enabled
+
+ These options can be used to temporarily disable various parts of TuxOnIce.
+
+ - extra_pages_allowance
+
+ When TuxOnIce does its atomic copy, it calls the driver model suspend
+ and resume methods. If you have DRI enabled with a driver such as fglrx,
+ this can result in the driver allocating a substantial amount of memory
+ for storing its state. Extra_pages_allowance tells TuxOnIce how much
+ extra memory it should ensure is available for those allocations. If
+ your attempts at hibernating end with a message in dmesg indicating that
+ insufficient extra pages were allowed, you need to increase this value.
+
+ - file/target:
+
+ Read this value to get the current setting. Write to it to point TuxOnIce
+ at a new storage location for the file allocator. See section 3.b.ii above
+ for details of how to set up the file allocator.
+
+ - freezer_test
+
+ This entry can be used to get TuxOnIce to just test the freezer and prepare
+ an image without actually doing a hibernation cycle. It is useful for
+ diagnosing freezing and image preparation issues.
+
+ - full_pageset2
+
+ TuxOnIce divides the pages that are stored in an image into two sets. The
+ difference between the two sets is that pages in pageset 1 are atomically
+ copied, and pages in pageset 2 are written to disk without being copied
+ first. A page CAN be written to disk without being copied first if and only
+ if its contents will not be modified or used at any time after userspace
+ processes are frozen. A page MUST be in pageset 1 if its contents are
+ modified or used at any time after userspace processes have been frozen.
+
+ Normally (ie if this option is enabled), TuxOnIce will put all pages on the
+ per-zone LRUs in pageset2, then remove those pages used by any userspace
+ user interface helper and TuxOnIce storage manager that are running,
+ together with pages used by the GEM memory manager introduced around 2.6.28
+ kernels.
+
+ If this option is disabled, a much more conservative approach will be taken.
+ The only pages in pageset2 will be those belonging to userspace processes,
+ with the exclusion of those belonging to the TuxOnIce userspace helpers
+ mentioned above. This will result in a much smaller pageset2, and will
+ therefore result in smaller images than are possible with this option
+ enabled.
+
+ - ignore_rootfs
+
+ TuxOnIce records which device is mounted as the root filesystem when
+ writing the hibernation image.
It will normally check at resume time that
+ this device isn't already mounted - that would be a cause of filesystem
+ corruption. In some particular cases (RAM based root filesystems), you
+ might want to disable this check. This option allows you to do that.
+
+ - image_exists:
+
+ Can be used in a script to determine whether a valid image exists at the
+ location currently pointed to by resume=. Returns up to three lines.
+ The first is whether an image exists (-1 for unsure, otherwise 0 or 1).
+ If an image exists, additional lines will return the machine and version.
+ Echoing anything to this entry removes any current image.
+
+ - image_size_limit:
+
+ The maximum size of hibernation image written to disk, measured in megabytes
+ (1024*1024).
+
+ - last_result:
+
+ The result of the last hibernation cycle, as defined in
+ include/linux/suspend-debug.h with the values SUSPEND_ABORTED to
+ SUSPEND_KEPT_IMAGE. This is a bitmask.
+
+ - late_cpu_hotplug:
+
+ This sysfs entry controls whether cpu hotplugging is done - as normal - just
+ before (unplug) and after (replug) the atomic copy/restore (so that all
+ CPUs/cores are available for multithreaded I/O). The alternative is to
+ unplug all secondary CPUs/cores at the start of hibernating/resuming, and
+ replug them at the end of resuming. No multithreaded I/O will be possible in
+ this configuration, but the odd machine has been reported to require it.
+
+ - lid_file:
+
+ This determines which ACPI button file we look in to determine whether the
+ lid is open or closed after resuming from suspend to disk or power off.
+ If the entry is set to "lid/LID", we'll open /proc/acpi/button/lid/LID/state
+ and check its contents at the appropriate moment. See post_wake_state below
+ for more details on how this entry is used.
+
+ - log_everything (CONFIG_PM_DEBUG):
+
+ Setting this option results in all messages printed being logged. Normally,
+ only a subset are logged, so as to not slow the process and not clutter the
+ logs. Useful for debugging. It can be toggled during a cycle by pressing
+ 'L'.
+
+ - no_load_direct:
+
+ This is a debugging option. If, when loading the atomically copied pages of
+ an image, TuxOnIce finds that the destination address for a page is free,
+ it will normally allocate that page, load the data directly into that
+ address and skip it in the atomic restore. If this option is disabled, the
+ page will be loaded somewhere else and atomically restored like other pages.
+
+ - no_flusher_thread:
+
+ When doing multithreaded I/O (see below), the first online CPU can be used
+ to _just_ submit compressed pages when writing the image, rather than
+ compressing and submitting data. This option is normally disabled, but has
+ been included because Nigel would like to see whether it will be more useful
+ as the number of cores/cpus in computers increases.
+
+ - no_multithreaded_io:
+
+ TuxOnIce will normally create one thread per cpu/core on your computer,
+ each of which will then perform I/O. This will generally result in
+ throughput that's the maximum the storage medium can handle. There
+ shouldn't be any reason to disable multithreaded I/O now, but this option
+ has been retained for debugging purposes.
+
+ - no_pageset2
+
+ See the entry for full_pageset2 above for an explanation of pagesets.
+ Enabling this option causes TuxOnIce to do an atomic copy of all pages,
+ thereby limiting the maximum image size to 1/2 of memory, as swsusp does.
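+
+ For example, to experiment with the swsusp-like behaviour described in the
+ no_pageset2 entry above, you could toggle it before starting a cycle. This
+ is only a sketch: it assumes the entry accepts the same 0/1 convention as
+ the other TuxOnIce sysfs flags such as user_interface/enable_escape.
+
+ echo 1 > /sys/power/tuxonice/no_pageset2
+ hibernate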
+
+ - no_pageset2_if_unneeded
+
+ See the entry for full_pageset2 above for an explanation of pagesets.
+ Enabling this option causes TuxOnIce to act like no_pageset2 was enabled
+ if and only if it isn't needed anyway. This option may still make TuxOnIce
+ less reliable because pageset2 pages are normally used to store the
+ atomic copy - drivers that want to do allocations of larger amounts of
+ memory in one shot will be more likely to find that those amounts aren't
+ available if this option is enabled.
+
+ - pause_between_steps (CONFIG_PM_DEBUG):
+
+ This option is used during debugging, to make TuxOnIce pause between
+ each step of the process. It is ignored when the nice display is on.
+
+ - post_wake_state:
+
+ TuxOnIce provides support for automatically waking after a user-selected
+ delay, and using a different powerdown method if the lid is still closed.
+ (Yes, we're assuming a laptop). This entry lets you choose what state
+ should be entered next. The values are those described under
+ powerdown_method, below. It can be used to suspend to RAM after hibernating,
+ then power down properly after (say) 20 minutes. It can also be used to
+ power down properly, then wake at (say) 6.30am and suspend to RAM until
+ you're ready to use the machine.
+
+ - powerdown_method:
+
+ Used to select a method by which TuxOnIce should powerdown after writing the
+ image. Currently:
+
+ 0: Don't use ACPI to power off.
+ 3: Attempt to enter Suspend-to-ram.
+ 4: Attempt to enter ACPI S4 mode.
+ 5: Attempt to power down via ACPI S5 mode.
+
+ Note that these options are highly dependent upon your hardware & software:
+
+ 3: When successful, your machine suspends to ram instead of powering off.
+ The advantage of using this mode is that it doesn't matter whether your
+ battery has enough charge to make it through to your next resume. If it
+ lasts, you will simply resume from suspend to ram (and the image on disk
+ will be discarded). If the battery runs out, you will resume from disk
+ instead. The disadvantage is that it takes longer than a normal
+ suspend-to-ram to enter the state, since the suspend-to-disk image needs
+ to be written first.
+ 4/5: When successful, your machine will be off and consume (almost) no power.
+ But it might still react to some external events like opening the lid or
+ traffic on a network or USB device. For the bios, resume is then the same
+ as a warm boot, similar to a situation where you used the command `reboot'
+ to reboot your machine. If your machine has problems on warm boot or if
+ you want to protect your machine with the bios password, this is probably
+ not the right choice. Mode 4 may be necessary on some machines where ACPI
+ wake up methods need to be run to properly reinitialise hardware after a
+ hibernation cycle.
+ 0: Switch the machine completely off. The only possible wakeup is the power
+ button. For the bios, resume is then the same as a cold boot, in
+ particular you would have to provide your bios boot password if your
+ machine uses that feature for booting.
+
+ - progressbar_granularity_limit:
+
+ This option can be used to limit the granularity of the progress bar
+ displayed with a bootsplash screen. The value is the maximum number of
+ steps. That is, 10 will make the progress bar jump in 10% increments.
+
+ - reboot:
+
+ This option causes TuxOnIce to reboot rather than powering down
+ at the end of saving an image. It can be toggled during a cycle by pressing
+ 'R'.
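+
+ As an illustration of combining the two entries above, you might suspend to
+ RAM after writing the image, but power down completely if the machine is
+ later woken while the lid is still closed. The values below are only an
+ example; pick the ones that suit your hardware from the table above:
+
+ echo 3 > /sys/power/tuxonice/powerdown_method
+ echo 5 > /sys/power/tuxonice/post_wake_state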
+
+ - resume:
+
+ This sysfs entry can be used to read and set the location in which TuxOnIce
+ will look for the signature of an image - the value set using resume= at
+ boot time or CONFIG_PM_STD_PARTITION ("Default resume partition"). By
+ writing to this file as well as modifying your bootloader's configuration
+ file (eg menu.lst), you can set or reset the location of your image or the
+ method of storing the image without rebooting.
+
+ - replace_swsusp (CONFIG_TOI_REPLACE_SWSUSP):
+
+ This option makes
+
+ echo disk > /sys/power/state
+
+ activate TuxOnIce instead of swsusp. Regardless of whether this option is
+ enabled, any invocation of swsusp's resume time trigger will cause TuxOnIce
+ to check for an image too. This is due to the fact that at resume time, we
+ can't know whether this option was enabled until we see if an image is there
+ for us to resume from. (And when an image exists, we don't care whether we
+ did replace swsusp anyway - we just want to resume).
+
+ - resume_commandline:
+
+ This entry can be read after resuming to see the commandline that was used
+ when resuming began. You might use this to set up two bootloader entries
+ that are the same apart from the fact that one includes an extra append=
+ argument "at_work=1". You could then grep resume_commandline in your
+ post-resume scripts and configure networking (for example) differently
+ depending upon whether you're at home or work. resume_commandline can be
+ set to arbitrary text if you wish to remove sensitive contents.
+
+ - swap/swapfilename:
+
+ This entry is used to specify the swapfile or partition that
+ TuxOnIce will attempt to swapon/swapoff automatically. Thus, if
+ I normally use /dev/hda1 for swap, and want to use /dev/hda2 specifically
+ for my hibernation image, I would
+
+ echo /dev/hda2 > /sys/power/tuxonice/swap/swapfile
+
+ /dev/hda2 would then be automatically swapon'd and swapoff'd. Note that the
+ swapon and swapoff occur while other processes are frozen (including kswapd)
+ so this swap file will not be used up when attempting to free memory. The
+ partition/file is also given the highest priority, so other swapfiles/partitions
+ will only be used to save the image when this one is filled.
+
+ The value of this file is used by headerlocations along with any currently
+ activated swapfiles/partitions.
+
+ - swap/headerlocations:
+
+ This option tells you the resume= options to use for swap devices you
+ currently have activated. It is particularly useful when you only want to
+ use a swap file to store your image. See above for further details.
+
+ - test_bio
+
+ This is a debugging option. When enabled, TuxOnIce will not hibernate.
+ Instead, when asked to write an image, it will skip the atomic copy,
+ just doing the writing of the image and then returning control to the
+ user at the point where it would have powered off. This is useful for
+ testing throughput in different configurations.
+
+ - test_filter_speed
+
+ This is a debugging option. When enabled, TuxOnIce will not hibernate.
+ Instead, when asked to write an image, it will not write anything or do
+ an atomic copy, but will only run any enabled compression algorithm on the
+ data that would have been written (the source pages of the atomic copy in
+ the case of pageset 1). This is useful for comparing the performance of
+ compression algorithms and for determining the extent to which an upgrade
+ to your storage method would improve hibernation speed.
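+
+ As an example of using the resume and swap/headerlocations entries together,
+ you could repoint TuxOnIce at new storage without rebooting by copying the
+ value reported by headerlocations into the resume entry. The value below is
+ purely illustrative; use whatever your own headerlocations reports:
+
+ cat /sys/power/tuxonice/swap/headerlocations
+ echo swap:/dev/hda2:0x242d > /sys/power/tuxonice/resume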
+
+ - user_interface/debug_sections (CONFIG_PM_DEBUG):
+
+ This value, together with the console log level, controls what debugging
+ information is displayed. The console log level determines the level of
+ detail, and this value determines what detail is displayed. This value is
+ a bit vector, and the meaning of the bits can be found in the kernel tree
+ in include/linux/tuxonice.h. It can be overridden using the kernel's
+ command line option suspend_dbg.
+
+ - user_interface/default_console_level (CONFIG_PM_DEBUG):
+
+ This determines the value of the console log level at the start of a
+ hibernation cycle. If debugging is compiled in, the console log level can be
+ changed during a cycle by pressing the digit keys. Meanings are:
+
+ 0: Nice display.
+ 1: Nice display plus numerical progress.
+ 2: Errors only.
+ 3: Low level debugging info.
+ 4: Medium level debugging info.
+ 5: High level debugging info.
+ 6: Verbose debugging info.
+
+ - user_interface/enable_escape:
+
+ Setting this to "1" will enable you to abort a hibernation cycle or resuming
+ by pressing escape, "0" (default) disables this feature. Note that enabling
+ this option means that you cannot initiate a hibernation cycle and then walk
+ away from your computer, expecting it to be secure. With this feature
+ disabled, you can validly have this expectation once TuxOnIce begins to write
+ the image to disk. (Prior to this point, it is possible that TuxOnIce might
+ abort because of a failure to freeze all processes or because constraints
+ on its ability to save the image are not met).
+
+ - user_interface/program
+
+ This entry is used to tell TuxOnIce what userspace program to use for
+ providing a user interface while hibernating. The program uses a netlink
+ socket to pass messages back and forth to the kernel, and provides all of
+ the functions formerly implemented in the kernel user interface components.
+
+ - version:
+
+ The version of TuxOnIce you have compiled into the currently running kernel.
+
+ - wake_alarm_dir:
+
+ As mentioned above (post_wake_state), TuxOnIce supports automatically waking
+ after some delay. This entry allows you to select which wake alarm to use.
+ It should contain the value "rtc0" if you want to use
+ /sys/class/rtc/rtc0.
+
+ - wake_delay:
+
+ This value determines the delay from the end of writing the image until the
+ wake alarm is triggered. You can set an absolute time by writing the desired
+ time into /sys/class/rtc/<wake_alarm_dir>/wakealarm and leaving these values
+ empty.
+
+ Note that for the wakeup to actually occur, you may need to modify entries
+ in /proc/acpi/wakeup. This is done by echoing the name of the button in the
+ first column (eg PBTN) into the file.
+
+7. How do you get support?
+
+ Glad you asked. TuxOnIce is being actively maintained and supported
+ by Nigel (the guy doing most of the kernel coding at the moment), Bernard
+ (who maintains the hibernate script and userspace user interface components)
+ and its users.
+
+ Resources available include HowTos, FAQs and a Wiki, all available via
+ tuxonice.net. You can find the mailing lists there.
+
+8. I think I've found a bug. What should I do?
+
+ By far and away, the most common problems people have with TuxOnIce
+ relate to drivers not having adequate power management support. In this
+ case, it is not a bug with TuxOnIce, but we can still help you. As we
+ mentioned above, such issues can usually be worked around by building the
+ functionality as modules and unloading them while hibernating.
Please visit
+ the Wiki for up-to-date lists of known issues and workarounds.
+
+ If this information doesn't help, try running:
+
+ hibernate --bug-report
+
+ ...and sending the output to the users mailing list.
+
+ Good information on how to provide us with useful information from an
+ oops is found in the file REPORTING-BUGS, in the top level directory
+ of the kernel tree. If you get an oops, please especially note the
+ information about running what is printed on the screen through ksymoops.
+ The raw information is useless.
+
+9. When will XXX be supported?
+
+ If there's a feature missing from TuxOnIce that you'd like, feel free to
+ ask. We try to be obliging, within reason.
+
+ Patches are welcome. Please send to the list.
+
+10. How does it work?
+
+ TuxOnIce does its work in a number of steps.
+
+ a. Freezing system activity.
+
+ The first main stage in hibernating is to stop all other activity. This is
+ achieved in stages. Processes are considered in four groups, which we will
+ describe in reverse order for clarity's sake: Threads with the PF_NOFREEZE
+ flag, kernel threads without this flag, userspace processes with the
+ PF_SYNCTHREAD flag and all other processes. The first set (PF_NOFREEZE) are
+ untouched by the refrigerator code. They are allowed to run during hibernating
+ and resuming, and are used to support user interaction, storage access or the
+ like. Other kernel threads (those unneeded while hibernating) are frozen last.
+ This leaves us with userspace processes that need to be frozen. When a
+ process enters one of the *_sync system calls, we set a PF_SYNCTHREAD flag on
+ that process for the duration of that call. Processes that have this flag are
+ frozen after processes without it, so that we can seek to ensure that dirty
+ data is synced to disk as quickly as possible in a situation where other
+ processes may be submitting writes at the same time. Freezing the processes
+ that are submitting data stops new I/O from being submitted. Syncthreads can
+ then cleanly finish their work. So the order is:
+
+ - Userspace processes without PF_SYNCTHREAD or PF_NOFREEZE;
+ - Userspace processes with PF_SYNCTHREAD (they won't have NOFREEZE);
+ - Kernel processes without PF_NOFREEZE.
+
+ b. Eating memory.
+
+ For a successful hibernation cycle, you need to have enough disk space to store the
+ image and enough memory for the various limitations of TuxOnIce's
+ algorithm. You can also specify a maximum image size. In order to meet
+ those constraints, TuxOnIce may 'eat' memory. If, after freezing
+ processes, the constraints aren't met, TuxOnIce will thaw all the
+ other processes and begin to eat memory until its calculations indicate
+ the constraints are met. It will then freeze processes again and recheck
+ its calculations.
+
+ c. Allocation of storage.
+
+ Next, TuxOnIce allocates the storage that will be used to save
+ the image.
+
+ The core of TuxOnIce knows nothing about how or where pages are stored. We
+ therefore request the active allocator (remember you might have compiled in
+ more than one!) to allocate enough storage for our expected image size. If
+ this request cannot be fulfilled, we eat more memory and try again. If it
+ is fulfilled, we seek to allocate additional storage, just in case our
+ expected compression ratio (if any) isn't achieved. This time, however, we
+ just continue if we can't allocate enough storage.
+
+ If these calls to our allocator change the characteristics of the image
+ such that we haven't allocated enough memory, we also loop. (The allocator
+ may well need to allocate space for its storage information).
+
+ d. Write the first part of the image.
+
+ TuxOnIce stores the image in two sets of pages called 'pagesets'.
+ Pageset 2 contains pages on the active and inactive lists; essentially
+ the page cache. Pageset 1 contains all other pages, including the kernel.
+ We use two pagesets for one important reason: We need to make an atomic copy
+ of the kernel to ensure consistency of the image. Without a second pageset,
+ that would limit us to an image that was at most half the amount of memory
+ available. Using two pagesets allows us to store a full image. Since pageset
+ 2 pages won't be needed in saving pageset 1, we first save pageset 2 pages.
+ We can then make our atomic copy of the remaining pages using both pageset 2
+ pages and any other pages that are free. While saving both pagesets, we are
+ careful not to corrupt the image. Among other things, we use lowlevel block
+ I/O routines that don't change the pagecache contents.
+
+ The next step, then, is writing pageset 2.
+
+ e. Suspending drivers and storing processor context.
+
+ Having written pageset2, TuxOnIce calls the power management functions to
+ notify drivers of the hibernation, and saves the processor state in preparation
+ for the atomic copy of memory we are about to make.
+
+ f. Atomic copy.
+
+ At this stage, everything else but the TuxOnIce code is halted. Processes
+ are frozen or idling, drivers are quiesced and have stored (ideally and where
+ necessary) their configuration in memory we are about to atomically copy.
+ In our lowlevel architecture specific code, we have saved the CPU state.
+ We can therefore now do our atomic copy before resuming drivers etc.
+
+ g. Save the atomic copy (pageset 1).
+
+ TuxOnIce can then write the atomic copy of the remaining pages. Since we
+ have copied the pages into other locations, we can continue to use the
+ normal block I/O routines without fear of corrupting our image.
+
+ h. Save the image header.
+
+ Nearly there! We save our settings and other parameters needed for
+ reloading pageset 1 in an 'image header'. We also tell our allocator to
+ serialise its data at this stage, so that it can reread the image at resume
+ time.
+
+ i. Set the image header.
+
+ Finally, we edit the header at our resume= location. The signature is
+ changed by the allocator to reflect the fact that an image exists, and to
+ point to the start of that data if necessary (swap allocator).
+
+ j. Power down.
+
+ Or reboot if we're debugging and the appropriate option is selected.
+
+ Whew!
+
+ Reloading the image.
+ --------------------
+
+ Reloading the image is essentially the reverse of all the above. We load
+ our copy of pageset 1, being careful to choose locations that aren't going
+ to be overwritten as we copy it back (We start very early in the boot
+ process, so there are no other processes to quiesce here). We then copy
+ pageset 1 back to its original location in memory and restore the process
+ context. We are now running with the original kernel. Next, we reload the
+ pageset 2 pages, free the memory and swap used by TuxOnIce, restore
+ the pageset header and restart processes. Sounds easy in comparison to
+ hibernating, doesn't it!
+
+ There is of course more to TuxOnIce than this, but this explanation
+ should be a good start.
If there's interest, I'll write further + documentation on range pages and the low level I/O. + +11. Who wrote TuxOnIce? + + (Answer based on the writings of Florent Chabaud, credits in files and + Nigel's limited knowledge; apologies to anyone missed out!) + + The main developers of TuxOnIce have been... + + Gabor Kuti + Pavel Machek + Florent Chabaud + Bernard Blackham + Nigel Cunningham + + Significant portions of swsusp, the code in the vanilla kernel which + TuxOnIce enhances, have been worked on by Rafael Wysocki. Thanks should + also be expressed to him. + + The above mentioned developers have been aided in their efforts by a host + of hundreds, if not thousands of testers and people who have submitted bug + fixes & suggestions. Of special note are the efforts of Michael Frank, who + had his computers repetitively hibernate and resume for literally tens of + thousands of cycles and developed scripts to stress the system and test + TuxOnIce far beyond the point most of us (Nigel included!) would consider + testing. His efforts have contributed as much to TuxOnIce as any of the + names above. diff -Nur linux-4.2/Documentation/scsi/link_power_management_policy.txt linux-4.2-pck/Documentation/scsi/link_power_management_policy.txt --- linux-4.2/Documentation/scsi/link_power_management_policy.txt 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/Documentation/scsi/link_power_management_policy.txt 2015-09-08 00:48:22.211698004 -0300 @@ -1,8 +1,11 @@ This parameter allows the user to set the link (interface) power management. -There are 3 possible options: +There are 4 possible options: Value Effect ---------------------------------------------------------------------------- +firmware_defaults Inherit configuration from the state programmed by + the firmware during system init. + min_power Tell the controller to try to make the link use the least possible power when possible. This may sacrifice some performance due to increased latency diff -Nur linux-4.2/Documentation/tp_smapi.txt linux-4.2-pck/Documentation/tp_smapi.txt --- linux-4.2/Documentation/tp_smapi.txt 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/Documentation/tp_smapi.txt 2015-09-08 00:48:22.211698004 -0300 @@ -0,0 +1,267 @@ +tp_smapi version 0.40 +IBM ThinkPad hardware functions driver + +Author: Shem Multinymous +Project: http://sourceforge.net/projects/tpctl +Wiki: http://thinkwiki.org/wiki/tp_smapi +List: linux-thinkpad@linux-thinkpad.org + (http://mailman.linux-thinkpad.org/mailman/listinfo/linux-thinkpad) + +Description +----------- + +ThinkPad laptops include a proprietary interface called SMAPI BIOS +(System Management Application Program Interface) which provides some +hardware control functionality that is not accessible by other means. + +This driver exposes some features of the SMAPI BIOS through a sysfs +interface. It is suitable for newer models, on which SMAPI is invoked +through IO port writes. Older models use a different SMAPI interface; +for those, try the "thinkpad" module from the "tpctl" package. + +WARNING: +This driver uses undocumented features and direct hardware access. +It thus cannot be guaranteed to work, and may cause arbitrary damage +(especially on models it wasn't tested on). + + +Module parameters +----------------- + +thinkpad_ec module: + force_io=1 lets thinkpad_ec load on some recent ThinkPad models + (e.g., T400 and T500) whose BIOS's ACPI DSDT reserves the ports we need. +tp_smapi module: + debug=1 enables verbose dmesg output. 
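+
+For example, the parameters described above can be passed when loading the
+modules by hand (an illustrative sketch; adapt it to your distribution's
+module configuration):
+
+# modprobe thinkpad_ec force_io=1
+# modprobe tp_smapi debug=1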
+
+
+Usage
+-----
+
+Control of battery charging thresholds (in percent of current full charge
+capacity):
+
+# echo 40 > /sys/devices/platform/smapi/BAT0/start_charge_thresh
+# echo 70 > /sys/devices/platform/smapi/BAT0/stop_charge_thresh
+# cat /sys/devices/platform/smapi/BAT0/*_charge_thresh
+
+ (This is useful since Li-Ion batteries wear out much faster at very
+ high or low charge levels. The driver will also keep the thresholds
+ across suspend-to-disk with AC disconnected; this isn't done
+ automatically by the hardware.)
+
+Inhibiting battery charging for 17 minutes (overrides thresholds):
+
+# echo 17 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes
+# echo 0 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes # stop
+# cat /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes
+
+ (This can be used to control which battery is charged when using an
+ Ultrabay battery.)
+
+Forcing battery discharging even if AC power is available:
+
+# echo 1 > /sys/devices/platform/smapi/BAT0/force_discharge # start discharge
+# echo 0 > /sys/devices/platform/smapi/BAT0/force_discharge # stop discharge
+# cat /sys/devices/platform/smapi/BAT0/force_discharge
+
+ (When AC is connected, forced discharging will automatically stop
+ when the battery is fully depleted -- this is useful for calibration.
+ Also, this attribute can be used to control which battery is discharged
+ when both a system battery and an Ultrabay battery are connected.)
+
+Misc read-only battery status attributes (see note about HDAPS below):
+
+/sys/devices/platform/smapi/BAT0/installed # 0 or 1
+/sys/devices/platform/smapi/BAT0/state # idle/charging/discharging
+/sys/devices/platform/smapi/BAT0/cycle_count # integer counter
+/sys/devices/platform/smapi/BAT0/current_now # instantaneous current
+/sys/devices/platform/smapi/BAT0/current_avg # last minute average
+/sys/devices/platform/smapi/BAT0/power_now # instantaneous power
+/sys/devices/platform/smapi/BAT0/power_avg # last minute average
+/sys/devices/platform/smapi/BAT0/last_full_capacity # in mWh
+/sys/devices/platform/smapi/BAT0/remaining_percent # remaining percent of energy (set by calibration)
+/sys/devices/platform/smapi/BAT0/remaining_percent_error # error range of remaining_percent (not reset by calibration)
+/sys/devices/platform/smapi/BAT0/remaining_running_time # in minutes, by last minute average power
+/sys/devices/platform/smapi/BAT0/remaining_running_time_now # in minutes, by instantaneous power
+/sys/devices/platform/smapi/BAT0/remaining_charging_time # in minutes
+/sys/devices/platform/smapi/BAT0/remaining_capacity # in mWh
+/sys/devices/platform/smapi/BAT0/design_capacity # in mWh
+/sys/devices/platform/smapi/BAT0/voltage # in mV
+/sys/devices/platform/smapi/BAT0/design_voltage # in mV
+/sys/devices/platform/smapi/BAT0/charging_max_current # max charging current
+/sys/devices/platform/smapi/BAT0/charging_max_voltage # max charging voltage
+/sys/devices/platform/smapi/BAT0/group{0,1,2,3}_voltage # see below
+/sys/devices/platform/smapi/BAT0/manufacturer # string
+/sys/devices/platform/smapi/BAT0/model # string
+/sys/devices/platform/smapi/BAT0/barcoding # string
+/sys/devices/platform/smapi/BAT0/chemistry # string
+/sys/devices/platform/smapi/BAT0/serial # integer
+/sys/devices/platform/smapi/BAT0/manufacture_date # YYYY-MM-DD
+/sys/devices/platform/smapi/BAT0/first_use_date # YYYY-MM-DD
+/sys/devices/platform/smapi/BAT0/temperature # in milli-Celsius
+/sys/devices/platform/smapi/BAT0/dump # see below
+/sys/devices/platform/smapi/ac_connected # 0 or 1
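+
+For example, to take a quick snapshot of the battery state, read a few of the
+attributes listed above (units are as noted in the list):
+
+# cat /sys/devices/platform/smapi/BAT0/state
+# cat /sys/devices/platform/smapi/BAT0/remaining_percent
+# cat /sys/devices/platform/smapi/BAT0/temperature # reported in milli-Celsius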
+
+The BAT0/group{0,1,2,3}_voltage attribute refers to the separate cell groups
+in each battery. For example, on the ThinkPad 600, X3x, T4x and R5x models,
+the battery contains 3 cell groups in series, where each group consists of 2
+or 3 cells connected in parallel. The voltage of each group is given by these
+attributes, and their sum (roughly) equals the "voltage" attribute.
+(The effective performance of the battery is determined by the weakest group,
+i.e., the one whose voltage changes most rapidly during dis/charging.)
+
+The "BAT0/dump" attribute gives a hex dump of the raw status data, which
+contains additional data not in the above (if you can figure it out). Some
+unused values are autodetected and replaced by "--".
+
+In all of the above, replace BAT0 with BAT1 to address the 2nd battery (e.g.
+in the UltraBay).
+
+
+Raw SMAPI calls:
+
+/sys/devices/platform/smapi/smapi_request
+This performs raw SMAPI calls. It uses a bad interface that cannot handle
+multiple simultaneous accesses. Don't touch it, it's for development only.
+If you did touch it, you would do something like
+# echo '211a 100 0 0' > /sys/devices/platform/smapi/smapi_request
+# cat /sys/devices/platform/smapi/smapi_request
+and notice that in the output "211a 34b b2 0 0 0 'OK'", the "4b" in the 2nd
+value, converted to decimal, is 75: the current charge stop threshold.
+
+
+Model-specific status
+---------------------
+
+Works (at least partially) on the following ThinkPad models:
+* A30
+* G41
+* R40, R50p, R51, R52
+* T23, T40, T40p, T41, T41p, T42, T42p, T43, T43p, T60
+* X24, X31, X32, X40, X41, X60
+* Z60t, Z61m
+
+Not all functions are available on all models; for detailed status, see:
+ http://thinkwiki.org/wiki/tp_smapi
+
+Please report success/failure by e-mail or on the Wiki.
+If you get a "not implemented" or "not supported" message, your laptop
+probably just can't do that (at least not via the SMAPI BIOS).
+For negative reports, follow the bug reporting guidelines below.
+If you send me the necessary technical data (i.e., SMAPI function
+interfaces), I will support additional models.
+
+
+Additional HDAPS features
+-------------------------
+
+The modified hdaps driver has several improvements on the one in mainline
+(beyond resolving the conflict with thinkpad_ec and tp_smapi):
+
+- Fixes reliability and improves support for recent ThinkPad models
+ (especially *60 and newer). Unlike the mainline driver, the modified hdaps
+ correctly follows the Embedded Controller communication protocol.
+
+- Extends the "invert" parameter to cover all possible axis orientations.
+ The possible values are as follows.
+ Let X,Y denote the hardware readouts.
+ Let R denote the laptop's roll (tilt left/right).
+ Let P denote the laptop's pitch (tilt forward/backward).
+ invert=0: R= X P= Y (same as mainline)
+ invert=1: R=-X P=-Y (same as mainline)
+ invert=2: R=-X P= Y (new)
+ invert=3: R= X P=-Y (new)
+ invert=4: R= Y P= X (new)
+ invert=5: R=-Y P=-X (new)
+ invert=6: R=-Y P= X (new)
+ invert=7: R= Y P=-X (new)
+ It's probably easiest to just try all 8 possibilities and see which yields
+ correct results (e.g., in the hdaps-gl visualisation).
+
+- Adds a whitelist which automatically sets the correct axis orientation for
+ some models. If the value for your model is wrong or missing, you can override
+ it using the "invert" parameter. Please also update the tables at
+ http://www.thinkwiki.org/wiki/tp_smapi and
+ http://www.thinkwiki.org/wiki/List_of_DMI_IDs
+ and submit a patch for the whitelist in hdaps.c.
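+
+For example, if the whitelist does not know your model and the default
+orientation is wrong, you can load the modified driver with an explicit
+orientation (invert=4 is only an illustration; try the values from the table
+above):
+
+# modprobe hdaps invert=4
+
+or make it persistent with a line such as "options hdaps invert=4" in a file
+under /etc/modprobe.d/.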
+
+- Provides new attributes:
+ /sys/devices/platform/hdaps/sampling_rate:
+ This determines the frequency at which the host queries the embedded
+ controller for accelerometer data (and informs the hdaps input devices).
+ Default=50.
+ /sys/devices/platform/hdaps/oversampling_ratio:
+ When set to X, the embedded controller is told to do physical accelerometer
+ measurements at a rate that is X times higher than the rate at which
+ the driver reads those measurements (i.e., X*sampling_rate). This
+ makes the readouts from the embedded controller fresher, and is also
+ useful for the running average filter (see next). Default=5.
+ /sys/devices/platform/hdaps/running_avg_filter_order:
+ When set to X, reported readouts will be the average of the last X physical
+ accelerometer measurements. Current firmware allows 1<=X<=8. Setting to a
+ high value decreases readout fluctuations. The averaging is handled by the
+ embedded controller, so no CPU resources are used. Higher values make the
+ readouts smoother, since the filter averages out both sensor noise (good)
+ and abrupt changes (bad). Default=2.
+
+- Provides a second input device, which publishes the raw accelerometer
+ measurements (without the fuzzing needed for joystick emulation). This input
+ device can be matched by a udev rule such as the following (all on one line):
+ KERNEL=="event[0-9]*", ATTRS{phys}=="hdaps/input1",
+ ATTRS{modalias}=="input:b0019v1014p5054e4801-*",
+ SYMLINK+="input/hdaps/accelerometer-event"
+
+A new version of the hdapsd userspace daemon, which uses the input device
+interface instead of polling sysfs, is available separately. Using this reduces
+the total interrupts per second generated by hdaps+hdapsd (on tickless kernels)
+to 50, down from a value that fluctuates between 50 and 100. Set the
+sampling_rate sysfs attribute to a lower value to further reduce interrupts,
+at the expense of response latency.
+
+Licensing note: all my changes to the HDAPS driver are licensed under the
+GPL version 2 or, at your option and to the extent allowed by derivation from
+prior works, any later version. My version of hdaps is derived work from the
+mainline version, which at the time of writing is available only under
+GPL version 2.
+
+Bug reporting
+-------------
+
+Mail . Please include:
+* Details about your model,
+* Relevant "dmesg" output. Make sure thinkpad_ec and tp_smapi are loaded with
+ the "debug=1" parameter (e.g., use "make load HDAPS=1 DEBUG=1").
+* Output of "dmidecode | grep -C5 Product"
+* Does the failed functionality work under Windows?
+
+
+More about SMAPI
+----------------
+
+For hints about what may be possible via the SMAPI BIOS and how, see:
+
+* IBM Technical Reference Manual for the ThinkPad 770
+ (http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD)
+* Exported symbols in PWRMGRIF.DLL or TPPWRW32.DLL (e.g., use "objdump -x").
+* drivers/char/mwave/smapi.c in the Linux kernel tree.
+* The "thinkpad" SMAPI module (http://tpctl.sourceforge.net).
+* The SMAPI_* constants in tp_smapi.c.
+
+Note that in the above Technical Reference and in the "thinkpad" module,
+SMAPI is invoked through a function call to some physical address.
However,
+the interface used by tp_smapi and the above mwave driver, and apparently
+required by newer ThinkPads, is different: you set the parameters up in the
+CPU's registers and write to ports 0xB2 (the APM control port) and 0x4F; this
+triggers an SMI (System Management Interrupt), causing the CPU to enter
+SMM (System Management Mode) and run the BIOS firmware; the results are
+returned in the CPU's registers. It is not clear what the relation is between
+the two variants of SMAPI, though the assignment of error codes seems to be
+similar.
+
+In addition, the embedded controller on ThinkPad laptops has a non-standard
+interface at IO ports 0x1600-0x161F (mapped to LPC channel 3 of the H8S chip).
+The interface provides various system management services (currently known:
+battery information and accelerometer readouts). For more information see the
+thinkpad_ec module and the H8S hardware documentation:
+http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf
diff -Nur linux-4.2/Documentation/vm/00-INDEX linux-4.2-pck/Documentation/vm/00-INDEX
--- linux-4.2/Documentation/vm/00-INDEX 2015-08-30 15:34:09.000000000 -0300
+++ linux-4.2-pck/Documentation/vm/00-INDEX 2015-09-08 00:51:03.460668141 -0300
@@ -16,6 +16,8 @@
 - explains what hwpoison is
 ksm.txt
 - how to use the Kernel Samepage Merging feature.
+uksm.txt
+ - Introduction to Ultra KSM
 numa
 - information about NUMA specific code in the Linux vm.
 numa_memory_policy.txt
diff -Nur linux-4.2/Documentation/vm/uksm.txt linux-4.2-pck/Documentation/vm/uksm.txt
--- linux-4.2/Documentation/vm/uksm.txt 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/Documentation/vm/uksm.txt 2015-09-08 00:51:03.460668141 -0300
@@ -0,0 +1,59 @@
+The Ultra Kernel Samepage Merging feature
+----------------------------------------------
+/*
+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
+ *
+ * This is an improvement upon KSM. Some basic data structures and routines
+ * are borrowed from ksm.c .
+ *
+ * Its new features:
+ * 1. Full system scan:
+ * It automatically scans all user processes' anonymous VMAs. Kernel-user
+ * interaction to submit a memory area to KSM is no longer needed.
+ *
+ * 2. Rich area detection:
+ * It automatically detects rich areas containing abundant duplicated
+ * pages. Rich areas are given a full scan speed. Poor areas are
+ * sampled at a reasonable speed with very low CPU consumption.
+ *
+ * 3. Ultra Per-page scan speed improvement:
+ * A new hash algorithm is proposed. As a result, on a machine with
+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
+ * can scan memory areas that do not contain duplicated pages at a speed of
+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
+ * 477MB/sec ~ 923MB/sec.
+ *
+ * 4. Thrashing area avoidance:
+ * Thrashing areas (VMAs that have frequent KSM page break-outs) can be
+ * filtered out. My benchmark shows it's more efficient than KSM's per-page
+ * hash value based volatile page detection.
+ *
+ *
+ * 5. Misc changes upon KSM:
+ * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
+ * comparison. It's much faster than the default C version on x86.
+ * * rmap_item now has a struct page * member to loosely cache an
+ * address-->page mapping, which reduces the number of time-costly
+ * follow_page() calls.
+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
+ * ksm is needed for this case.
+ *
+ * 6.
Full Zero Page consideration(contributed by Figo Zhang) + * Now uksmd consider full zero pages as special pages and merge them to an + * special unswappable uksm zero page. + */ + +ChangeLog: + +2012-05-05 The creation of this Doc +2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. +2012-05-28 UKSM 0.1.1.2 bug fix release +2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 +2012-07-2 UKSM 0.1.2-beta2 +2012-07-10 UKSM 0.1.2-beta3 +2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. +2012-10-13 UKSM 0.1.2.1 Bug fixes. +2012-12-31 UKSM 0.1.2.2 Minor bug fixes. +2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug". +2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings. diff -Nur linux-4.2/MAINTAINERS linux-4.2-pck/MAINTAINERS --- linux-4.2/MAINTAINERS 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/MAINTAINERS 2015-09-08 11:20:31.817043710 -0300 @@ -5807,6 +5807,19 @@ F: Documentation/kbuild/kconfig-language.txt F: scripts/kconfig/ +KDBUS +M: Greg Kroah-Hartman +M: Daniel Mack +M: David Herrmann +M: Djalal Harouni +L: linux-kernel@vger.kernel.org +S: Maintained +F: ipc/kdbus/* +F: samples/kdbus/* +F: Documentation/kdbus/* +F: include/uapi/linux/kdbus.h +F: tools/testing/selftests/kdbus/ + KDUMP M: Vivek Goyal M: Haren Myneni @@ -10464,6 +10477,13 @@ F: drivers/tc/ F: include/linux/tc.h +TUXONICE (ENHANCED HIBERNATION) +P: Nigel Cunningham +M: nigel@nigelcunningham.com.au +L: tuxonice-devel@tuxonice.net +W: http://tuxonice.net +S: Maintained + U14-34F SCSI DRIVER M: Dario Ballabio L: linux-scsi@vger.kernel.org diff -Nur linux-4.2/Makefile linux-4.2-pck/Makefile --- linux-4.2/Makefile 2015-08-30 16:17:42.000000000 -0300 +++ linux-4.2-pck/Makefile 2015-09-15 00:10:15.323170526 -0300 @@ -1346,6 +1346,7 @@ %docs: scripts_basic FORCE $(Q)$(MAKE) $(build)=scripts build_docproc $(Q)$(MAKE) $(build)=Documentation/DocBook $@ + $(Q)$(MAKE) $(build)=Documentation/kdbus $@ else # KBUILD_EXTMOD diff -Nur linux-4.2/arch/x86/Kconfig.cpu linux-4.2-pck/arch/x86/Kconfig.cpu --- linux-4.2/arch/x86/Kconfig.cpu 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/Kconfig.cpu 2015-09-08 00:50:52.507860491 -0300 @@ -137,9 +137,8 @@ -Paxville -Dempsey - config MK6 - bool "K6/K6-II/K6-III" + bool "AMD K6/K6-II/K6-III" depends on X86_32 ---help--- Select this for an AMD K6-family processor. Enables use of @@ -147,7 +146,7 @@ flags to GCC. config MK7 - bool "Athlon/Duron/K7" + bool "AMD Athlon/Duron/K7" depends on X86_32 ---help--- Select this for an AMD Athlon K7-family processor. Enables use of @@ -155,12 +154,62 @@ flags to GCC. config MK8 - bool "Opteron/Athlon64/Hammer/K8" + bool "AMD Opteron/Athlon64/Hammer/K8" ---help--- Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables use of some extended instructions, and passes appropriate optimization flags to GCC. +config MK8SSE3 + bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" + ---help--- + Select this for improved AMD Opteron or Athlon64 Hammer-family processors. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. + +config MK10 + bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" + ---help--- + Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, + Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. + Enables use of some extended instructions, and passes appropriate + optimization flags to GCC. 
+ +config MBARCELONA + bool "AMD Barcelona" + ---help--- + Select this for AMD Barcelona and newer processors. + + Enables -march=barcelona + +config MBOBCAT + bool "AMD Bobcat" + ---help--- + Select this for AMD Bobcat processors. + + Enables -march=btver1 + +config MBULLDOZER + bool "AMD Bulldozer" + ---help--- + Select this for AMD Bulldozer processors. + + Enables -march=bdver1 + +config MPILEDRIVER + bool "AMD Piledriver" + ---help--- + Select this for AMD Piledriver processors. + + Enables -march=bdver2 + +config MJAGUAR + bool "AMD Jaguar" + ---help--- + Select this for AMD Jaguar processors. + + Enables -march=btver2 + config MCRUSOE bool "Crusoe" depends on X86_32 @@ -251,8 +300,17 @@ using the cpu family field in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. +config MATOM + bool "Intel Atom" + ---help--- + + Select this for the Intel Atom platform. Intel Atom CPUs have an + in-order pipelining architecture and thus can benefit from + accordingly optimized code. Use a recent GCC with specific Atom + support in order to fully benefit from selecting this option. + config MCORE2 - bool "Core 2/newer Xeon" + bool "Intel Core 2" ---help--- Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and @@ -260,14 +318,63 @@ family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) -config MATOM - bool "Intel Atom" + Enables -march=core2 + +config MNEHALEM + bool "Intel Nehalem" ---help--- - Select this for the Intel Atom platform. Intel Atom CPUs have an - in-order pipelining architecture and thus can benefit from - accordingly optimized code. Use a recent GCC with specific Atom - support in order to fully benefit from selecting this option. + Select this for 1st Gen Core processors in the Nehalem family. + + Enables -march=nehalem + +config MWESTMERE + bool "Intel Westmere" + ---help--- + + Select this for the Intel Westmere formerly Nehalem-C family. + + Enables -march=westmere + +config MSILVERMONT + bool "Intel Silvermont" + ---help--- + + Select this for the Intel Silvermont platform. + + Enables -march=silvermont + +config MSANDYBRIDGE + bool "Intel Sandy Bridge" + ---help--- + + Select this for 2nd Gen Core processors in the Sandy Bridge family. + + Enables -march=sandybridge + +config MIVYBRIDGE + bool "Intel Ivy Bridge" + ---help--- + + Select this for 3rd Gen Core processors in the Ivy Bridge family. + + Enables -march=ivybridge + +config MHASWELL + bool "Intel Haswell" + ---help--- + + Select this for 4th Gen Core processors in the Haswell family. + + Enables -march=haswell + +config MBROADWELL + bool "Intel Broadwell" + ---help--- + + Select this for 5th Gen Core processors in the Broadwell family. + + Enables -march=broadwell config GENERIC_CPU bool "Generic-x86-64" @@ -276,6 +383,19 @@ Generic x86-64 CPU. Run equally well on all x86-64 CPUs. +config MNATIVE + bool "Native optimizations autodetected by GCC" + ---help--- + + GCC 4.2 and above support -march=native, which automatically detects + the optimum settings to use based on your processor. -march=native + also detects and applies additional settings beyond -march specific + to your CPU, (eg. -msse4). Unless you have a specific reason not to + (e.g. distcc cross-compiling), you should probably be using + -march=native rather than anything listed below. + + Enables -march=native + endchoice config X86_GENERIC @@ -290,6 +410,16 @@ This is really intended for distributors who need more generic optimizations. 
+config X86_MARCH_NATIVE + bool "Use -march=native cflag (EXPERIMENTAL)" + help + Setting Y here, will result in passing the -march=native and + -mtune=native cflags to GCC while compiling the kernel, which + makes GCC check the CPU capabilities and use the best cflags + for your computer. + + Set Y here only if you use >=gcc-4.2.0. + # # Define implied options from the CPU selection here config X86_INTERNODE_CACHE_SHIFT @@ -300,7 +430,7 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU default "4" if MELAN || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX @@ -331,11 +461,11 @@ config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MNATIVE config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MATOM || MNATIVE config X86_USE_3DNOW def_bool y @@ -359,17 +489,17 @@ config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MNATIVE || MATOM) || X86_64 config X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM + depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE # this should be set for all -march=.. options where the compiler # generates cmov. 
config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) config X86_MINIMUM_CPU_FAMILY int diff -Nur linux-4.2/arch/x86/Makefile linux-4.2-pck/arch/x86/Makefile --- linux-4.2/arch/x86/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/Makefile 2015-09-08 00:50:52.507860491 -0300 @@ -94,13 +94,35 @@ KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) + cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8) + cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10) + cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona) + cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1) + cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1) + cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2) + cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) cflags-$(CONFIG_MCORE2) += \ - $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) - cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) + $(call cc-option,-march=core2,$(call cc-option,-mtune=core2)) + cflags-$(CONFIG_MNEHALEM) += \ + $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem)) + cflags-$(CONFIG_MWESTMERE) += \ + $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere)) + cflags-$(CONFIG_MSILVERMONT) += \ + $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont)) + cflags-$(CONFIG_MSANDYBRIDGE) += \ + $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge)) + cflags-$(CONFIG_MIVYBRIDGE) += \ + $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge)) + cflags-$(CONFIG_MHASWELL) += \ + $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell)) + cflags-$(CONFIG_MBROADWELL) += \ + $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell)) + cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \ + $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) KBUILD_CFLAGS += $(cflags-y) diff -Nur linux-4.2/arch/x86/Makefile_32.cpu linux-4.2-pck/arch/x86/Makefile_32.cpu --- linux-4.2/arch/x86/Makefile_32.cpu 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/Makefile_32.cpu 2015-09-08 00:50:52.507860491 -0300 @@ -10,6 +10,9 @@ endif align := $(cc-option-align) +ifeq ($(CONFIG_X86_MARCH_NATIVE),y) +cflags-y += -march=native +else cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 cflags-$(CONFIG_M586TSC) += -march=i586 @@ -23,7 +26,15 @@ # Please note, that patches that add -march=athlon-xp and friends are pointless. # They make zero difference whatsosever to performance at this time. 
cflags-$(CONFIG_MK7) += -march=athlon +cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) +cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon) +cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon) +cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon) +cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon) +cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon) +cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) +cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) @@ -32,8 +43,15 @@ cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) -cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) +cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem) +cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere) +cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont) +cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge) +cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge) +cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell) +cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell) +cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \ + $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) # AMD Elan support cflags-$(CONFIG_MELAN) += -march=i486 @@ -41,6 +59,8 @@ # Geode GX1 support cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx) +endif + # add at the end to overwrite eventual tuning options from earlier # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) diff -Nur linux-4.2/arch/x86/include/asm/module.h linux-4.2-pck/arch/x86/include/asm/module.h --- linux-4.2/arch/x86/include/asm/module.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/include/asm/module.h 2015-09-08 00:50:52.507860491 -0300 @@ -15,6 +15,22 @@ #define MODULE_PROC_FAMILY "586MMX " #elif defined CONFIG_MCORE2 #define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_MNATIVE +#define MODULE_PROC_FAMILY "NATIVE " +#elif defined CONFIG_MNEHALEM +#define MODULE_PROC_FAMILY "NEHALEM " +#elif defined CONFIG_MWESTMERE +#define MODULE_PROC_FAMILY "WESTMERE " +#elif defined CONFIG_MSILVERMONT +#define MODULE_PROC_FAMILY "SILVERMONT " +#elif defined CONFIG_MSANDYBRIDGE +#define MODULE_PROC_FAMILY "SANDYBRIDGE " +#elif defined CONFIG_MIVYBRIDGE +#define MODULE_PROC_FAMILY "IVYBRIDGE " +#elif defined CONFIG_MHASWELL +#define MODULE_PROC_FAMILY "HASWELL " +#elif defined CONFIG_MBROADWELL +#define MODULE_PROC_FAMILY "BROADWELL " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -33,6 +49,20 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " +#elif defined 
CONFIG_MK8SSE3 +#define MODULE_PROC_FAMILY "K8SSE3 " +#elif defined CONFIG_MK10 +#define MODULE_PROC_FAMILY "K10 " +#elif defined CONFIG_MBARCELONA +#define MODULE_PROC_FAMILY "BARCELONA " +#elif defined CONFIG_MBOBCAT +#define MODULE_PROC_FAMILY "BOBCAT " +#elif defined CONFIG_MBULLDOZER +#define MODULE_PROC_FAMILY "BULLDOZER " +#elif defined CONFIG_MPILEDRIVER +#define MODULE_PROC_FAMILY "PILEDRIVER " +#elif defined CONFIG_MJAGUAR +#define MODULE_PROC_FAMILY "JAGUAR " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff -Nur linux-4.2/arch/x86/kernel/early_printk.c linux-4.2-pck/arch/x86/kernel/early_printk.c --- linux-4.2/arch/x86/kernel/early_printk.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/kernel/early_printk.c 2015-09-08 00:48:22.215031180 -0300 @@ -27,7 +27,8 @@ static int max_ypos = 25, max_xpos = 80; static int current_ypos = 25, current_xpos; -static void early_vga_write(struct console *con, const char *str, unsigned n) +static void early_vga_write(struct console *con, const char *str, unsigned n, + unsigned int loglevel) { char c; int i, k, j; @@ -118,7 +119,8 @@ return timeout ? 0 : -1; } -static void early_serial_write(struct console *con, const char *s, unsigned n) +static void early_serial_write(struct console *con, const char *s, unsigned n, + unsigned int loglevel) { while (*s && n-- > 0) { if (*s == '\n') diff -Nur linux-4.2/arch/x86/kernel/espfix_64.c linux-4.2-pck/arch/x86/kernel/espfix_64.c --- linux-4.2/arch/x86/kernel/espfix_64.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/kernel/espfix_64.c 2015-09-08 11:20:31.877040782 -0300 @@ -173,6 +173,7 @@ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); pmd_p = (pmd_t *)page_address(page); + SetPageTOI_Untracked(virt_to_page(pmd_p)); pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PUD_CLONES; n++) @@ -185,6 +186,7 @@ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); pte_p = (pte_t *)page_address(page); + SetPageTOI_Untracked(virt_to_page(pte_p)); pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PMD_CLONES; n++) @@ -193,6 +195,7 @@ pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); + SetPageTOI_Untracked(virt_to_page(stack_page)); pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff -Nur linux-4.2/arch/x86/kernel/tsc.c linux-4.2-pck/arch/x86/kernel/tsc.c --- linux-4.2/arch/x86/kernel/tsc.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/kernel/tsc.c 2015-09-08 11:20:31.883707121 -0300 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -195,6 +196,10 @@ c2n->head = c2n->data; c2n->tail = c2n->data; + + // Don't let TuxOnIce make data RO - a secondary CPU will cause a triple fault + // if it loads microcode, which then does a printk, which may end up invoking cycles_2_ns + SetPageTOI_Untracked(virt_to_page(c2n)); } static inline unsigned long long cycles_2_ns(unsigned long long cyc) diff -Nur linux-4.2/arch/x86/mm/fault.c linux-4.2-pck/arch/x86/mm/fault.c --- linux-4.2/arch/x86/mm/fault.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/mm/fault.c 2015-09-08 11:20:31.893706633 -0300 @@ -13,6 +13,7 @@ #include /* hstate_index_to_shift */ #include /* prefetchw */ 
#include /* exception_enter(), ... */ +#include /* incremental image support */ #include /* faulthandler_disabled() */ #include /* dotraplinkage, ... */ @@ -652,6 +653,10 @@ unsigned long flags; int sig; + if (toi_make_writable(init_mm.pgd, address)) { + return; + } + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) { /* @@ -906,10 +911,101 @@ } } +#ifdef CONFIG_TOI_INCREMENTAL +/** + * _toi_do_cbw - Do a copy-before-write before letting the faulting process continue + */ +static void toi_do_cbw(struct page *page) +{ + struct toi_cbw_state *state = this_cpu_ptr(&toi_cbw_states); + + state->active = 1; + wmb(); + + if (state->enabled && state->next && PageTOI_CBW(page)) { + struct toi_cbw *this = state->next; + memcpy(this->virt, page_address(page), PAGE_SIZE); + this->pfn = page_to_pfn(page); + state->next = this->next; + } + + state->active = 0; +} + +/** + * _toi_make_writable - Defuse TOI's write protection + */ +int _toi_make_writable(pte_t *pte) +{ + struct page *page = pte_page(*pte); + if (PageTOI_RO(page)) { + pgd_t *pgd = __va(read_cr3()); + /* + * If this is a TuxOnIce caused fault, we may not have permission to + * write to a page needed to reset the permissions of the original + * page. Use swapper_pg_dir to get around this. + */ + load_cr3(swapper_pg_dir); + + set_pte_atomic(pte, pte_mkwrite(*pte)); + SetPageTOI_Dirty(page); + ClearPageTOI_RO(page); + + toi_do_cbw(page); + + load_cr3(pgd); + return 1; + } + return 0; +} + +/** + * toi_make_writable - Handle a (potential) fault caused by TOI's write protection + * + * Make a page writable that was protected. Might be because of a fault, or + * because we're allocating it and want it to be untracked. + * + * Note that in the fault handling case, we don't care about the error code. If + * called from the double fault handler, we won't have one. We just check to + * see if the page was made RO by TOI, and mark it dirty/release the protection + * if it was. + */ +int toi_make_writable(pgd_t *pgd, unsigned long address) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + if (pud_large(*pud)) + return _toi_make_writable((pte_t *) pud); + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_large(*pmd)) + return _toi_make_writable((pte_t *) pmd); + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return 0; + + return _toi_make_writable(pte); +} +#endif + static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) - return 0; + return 0; if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; @@ -1072,6 +1168,15 @@ kmemcheck_hide(regs); prefetchw(&mm->mmap_sem); + /* + * Detect and handle page faults due to TuxOnIce making pages read-only + * so that it can create incremental images. + * + * Do it early to avoid double faults. 
+ */ + if (unlikely(toi_make_writable(init_mm.pgd, address))) + return; + if (unlikely(kmmio_fault(regs, address))) return; diff -Nur linux-4.2/arch/x86/mm/init.c linux-4.2-pck/arch/x86/mm/init.c --- linux-4.2/arch/x86/mm/init.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/mm/init.c 2015-09-08 11:20:31.897039804 -0300 @@ -147,9 +147,10 @@ static void __init probe_page_size_mask(void) { -#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) +#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_TOI_INCREMENTAL) /* - * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * For CONFIG_DEBUG_PAGEALLOC or TuxOnIce's incremental image support, + * identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ diff -Nur linux-4.2/arch/x86/platform/efi/early_printk.c linux-4.2-pck/arch/x86/platform/efi/early_printk.c --- linux-4.2/arch/x86/platform/efi/early_printk.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/arch/x86/platform/efi/early_printk.c 2015-09-08 00:48:22.215031180 -0300 @@ -124,7 +124,8 @@ } static void -early_efi_write(struct console *con, const char *str, unsigned int num) +early_efi_write(struct console *con, const char *str, unsigned int num, + unsigned int loglevel) { struct screen_info *si; unsigned int len; diff -Nur linux-4.2/block/Kconfig.iosched linux-4.2-pck/block/Kconfig.iosched --- linux-4.2/block/Kconfig.iosched 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/block/Kconfig.iosched 2015-09-08 00:48:22.215031180 -0300 @@ -39,9 +39,30 @@ ---help--- Enable group IO scheduling in CFQ. +config IOSCHED_BFQ + tristate "BFQ I/O scheduler" + default y + ---help--- + The BFQ I/O scheduler tries to distribute bandwidth among + all processes according to their weights. + It aims at distributing the bandwidth as desired, independently of + the disk parameters and with any workload. It also tries to + guarantee low latency to interactive and soft real-time + applications. If compiled built-in (saying Y here), BFQ can + be configured to support hierarchical scheduling. + +config CGROUP_BFQIO + bool "BFQ hierarchical scheduling support" + depends on CGROUPS && IOSCHED_BFQ=y + default n + ---help--- + Enable hierarchical scheduling in BFQ, using the cgroups + filesystem interface. The name of the subsystem will be + bfqio. + choice prompt "Default I/O scheduler" - default DEFAULT_CFQ + default DEFAULT_BFQ help Select the I/O scheduler which will be used by default for all block devices. @@ -52,6 +73,16 @@ config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y + config DEFAULT_BFQ + bool "BFQ" if IOSCHED_BFQ=y + help + Selects BFQ as the default I/O scheduler which will be + used by default for all block devices. + The BFQ I/O scheduler aims at distributing the bandwidth + as desired, independently of the disk parameters and with + any workload. It also tries to guarantee low latency to + interactive and soft real-time applications. 
+ config DEFAULT_NOOP bool "No-op" @@ -61,6 +92,7 @@ string default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ + default "bfq" if DEFAULT_BFQ default "noop" if DEFAULT_NOOP endmenu diff -Nur linux-4.2/block/Makefile linux-4.2-pck/block/Makefile --- linux-4.2/block/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/block/Makefile 2015-09-08 11:20:31.933704681 -0300 @@ -7,7 +7,7 @@ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ - genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ + uuid.o genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ partitions/ obj-$(CONFIG_BOUNCE) += bounce.o @@ -18,6 +18,7 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o +obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff -Nur linux-4.2/block/bfq-cgroup.c linux-4.2-pck/block/bfq-cgroup.c --- linux-4.2/block/bfq-cgroup.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/block/bfq-cgroup.c 2015-09-08 00:48:22.215031180 -0300 @@ -0,0 +1,936 @@ +/* + * BFQ: CGROUPS support. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. + */ + +#ifdef CONFIG_CGROUP_BFQIO + +static DEFINE_MUTEX(bfqio_mutex); + +static bool bfqio_is_removed(struct bfqio_cgroup *bgrp) +{ + return bgrp ? !bgrp->online : false; +} + +static struct bfqio_cgroup bfqio_root_cgroup = { + .weight = BFQ_DEFAULT_GRP_WEIGHT, + .ioprio = BFQ_DEFAULT_GRP_IOPRIO, + .ioprio_class = BFQ_DEFAULT_GRP_CLASS, +}; + +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; +} + +static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct bfqio_cgroup, css) : NULL; +} + +/* + * Search the bfq_group for bfqd into the hash table (by now only a list) + * of bgrp. Must be called under rcu_read_lock(). + */ +static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, + struct bfq_data *bfqd) +{ + struct bfq_group *bfqg; + void *key; + + hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) { + key = rcu_dereference(bfqg->bfqd); + if (key == bfqd) + return bfqg; + } + + return NULL; +} + +static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, + struct bfq_group *bfqg) +{ + struct bfq_entity *entity = &bfqg->entity; + + /* + * If the weight of the entity has never been set via the sysfs + * interface, then bgrp->weight == 0. In this case we initialize + * the weight from the current ioprio value. Otherwise, the group + * weight, if set, has priority over the ioprio value. 
+ */ + if (bgrp->weight == 0) { + entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio); + entity->new_ioprio = bgrp->ioprio; + } else { + if (bgrp->weight < BFQ_MIN_WEIGHT || + bgrp->weight > BFQ_MAX_WEIGHT) { + printk(KERN_CRIT "bfq_group_init_entity: " + "bgrp->weight %d\n", bgrp->weight); + BUG(); + } + entity->new_weight = bgrp->weight; + entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight); + } + entity->orig_weight = entity->weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; + entity->my_sched_data = &bfqg->sched_data; + bfqg->active_entities = 0; +} + +static inline void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) +{ + struct bfq_entity *entity; + + BUG_ON(parent == NULL); + BUG_ON(bfqg == NULL); + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; +} + +/** + * bfq_group_chain_alloc - allocate a chain of groups. + * @bfqd: queue descriptor. + * @css: the leaf cgroup_subsys_state this chain starts from. + * + * Allocate a chain of groups starting from the one belonging to + * @cgroup up to the root cgroup. Stop if a cgroup on the chain + * to the root has already an allocated group on @bfqd. + */ +static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, + struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; + + for (; css != NULL; css = css->parent) { + bgrp = css_to_bfqio(css); + + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) { + /* + * All the cgroups in the path from there to the + * root must have a bfq_group for bfqd, so we don't + * need any more allocations. + */ + break; + } + + bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); + if (bfqg == NULL) + goto cleanup; + + bfq_group_init_entity(bgrp, bfqg); + bfqg->my_entity = &bfqg->entity; + + if (leaf == NULL) { + leaf = bfqg; + prev = leaf; + } else { + bfq_group_set_parent(prev, bfqg); + /* + * Build a list of allocated nodes using the bfqd + * filed, that is still unused and will be + * initialized only after the node will be + * connected. + */ + prev->bfqd = bfqg; + prev = bfqg; + } + } + + return leaf; + +cleanup: + while (leaf != NULL) { + prev = leaf; + leaf = leaf->bfqd; + kfree(prev); + } + + return NULL; +} + +/** + * bfq_group_chain_link - link an allocated group chain to a cgroup + * hierarchy. + * @bfqd: the queue descriptor. + * @css: the leaf cgroup_subsys_state to start from. + * @leaf: the leaf group (to be associated to @cgroup). + * + * Try to link a chain of groups to a cgroup hierarchy, connecting the + * nodes bottom-up, so we can be sure that when we find a cgroup in the + * hierarchy that already as a group associated to @bfqd all the nodes + * in the path to the root cgroup have one too. + * + * On locking: the queue lock protects the hierarchy (there is a hierarchy + * per device) while the bfqio_cgroup lock protects the list of groups + * belonging to the same cgroup. 
+ */ +static void bfq_group_chain_link(struct bfq_data *bfqd, + struct cgroup_subsys_state *css, + struct bfq_group *leaf) +{ + struct bfqio_cgroup *bgrp; + struct bfq_group *bfqg, *next, *prev = NULL; + unsigned long flags; + + assert_spin_locked(bfqd->queue->queue_lock); + + for (; css != NULL && leaf != NULL; css = css->parent) { + bgrp = css_to_bfqio(css); + next = leaf->bfqd; + + bfqg = bfqio_lookup_group(bgrp, bfqd); + BUG_ON(bfqg != NULL); + + spin_lock_irqsave(&bgrp->lock, flags); + + rcu_assign_pointer(leaf->bfqd, bfqd); + hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); + hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); + + spin_unlock_irqrestore(&bgrp->lock, flags); + + prev = leaf; + leaf = next; + } + + BUG_ON(css == NULL && leaf != NULL); + if (css != NULL && prev != NULL) { + bgrp = css_to_bfqio(css); + bfqg = bfqio_lookup_group(bgrp, bfqd); + bfq_group_set_parent(prev, bfqg); + } +} + +/** + * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. + * @bfqd: queue descriptor. + * @cgroup: cgroup being searched for. + * + * Return a group associated to @bfqd in @cgroup, allocating one if + * necessary. When a group is returned all the cgroups in the path + * to the root have a group associated to @bfqd. + * + * If the allocation fails, return the root group: this breaks guarantees + * but is a safe fallback. If this loss becomes a problem it can be + * mitigated using the equivalent weight (given by the product of the + * weights of the groups in the path from @group to the root) in the + * root scheduler. + * + * We allocate all the missing nodes in the path from the leaf cgroup + * to the root and we connect the nodes only after all the allocations + * have been successful. + */ +static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, + struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + struct bfq_group *bfqg; + + bfqg = bfqio_lookup_group(bgrp, bfqd); + if (bfqg != NULL) + return bfqg; + + bfqg = bfq_group_chain_alloc(bfqd, css); + if (bfqg != NULL) + bfq_group_chain_link(bfqd, css, bfqg); + else + bfqg = bfqd->root_group; + + return bfqg; +} + +/** + * bfq_bfqq_move - migrate @bfqq to @bfqg. + * @bfqd: queue descriptor. + * @bfqq: the queue to move. + * @entity: @bfqq's entity. + * @bfqg: the group to move to. + * + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating + * it on the new one. Avoid putting the entity on the old group idle tree. + * + * Must be called under the queue lock; the cgroup owning @bfqg must + * not disappear (by now this just means that we are called under + * rcu_read_lock()). + */ +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_entity *entity, struct bfq_group *bfqg) +{ + int busy, resume; + + busy = bfq_bfqq_busy(bfqq); + resume = !RB_EMPTY_ROOT(&bfqq->sort_list); + + BUG_ON(resume && !entity->on_st); + BUG_ON(busy && !resume && entity->on_st && + bfqq != bfqd->in_service_queue); + + if (busy) { + BUG_ON(atomic_read(&bfqq->ref) < 2); + + if (!resume) + bfq_del_bfqq_busy(bfqd, bfqq, 0); + else + bfq_deactivate_bfqq(bfqd, bfqq, 0); + } else if (entity->on_st) + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + + /* + * Here we use a reference to bfqg. We don't need a refcounter + * as the cgroup reference will not be dropped, so that its + * destroy() callback will not be invoked. 
+ */ + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; + + if (busy && resume) + bfq_activate_bfqq(bfqd, bfqq); + + if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); +} + +/** + * __bfq_bic_change_cgroup - move @bic to @cgroup. + * @bfqd: the queue descriptor. + * @bic: the bic to move. + * @cgroup: the cgroup to move to. + * + * Move bic to cgroup, assuming that bfqd->queue is locked; the caller + * has to make sure that the reference to cgroup is valid across the call. + * + * NOTE: an alternative approach might have been to store the current + * cgroup in bfqq and getting a reference to it, reducing the lookup + * time here, at the price of slightly more complex code. + */ +static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct cgroup_subsys_state *css) +{ + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); + struct bfq_entity *entity; + struct bfq_group *bfqg; + struct bfqio_cgroup *bgrp; + + bgrp = css_to_bfqio(css); + + bfqg = bfq_find_alloc_group(bfqd, css); + if (async_bfqq != NULL) { + entity = &async_bfqq->entity; + + if (entity->sched_data != &bfqg->sched_data) { + bic_set_bfqq(bic, NULL, 0); + bfq_log_bfqq(bfqd, async_bfqq, + "bic_change_group: %p %d", + async_bfqq, atomic_read(&async_bfqq->ref)); + bfq_put_queue(async_bfqq); + } + } + + if (sync_bfqq != NULL) { + entity = &sync_bfqq->entity; + if (entity->sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); + } + + return bfqg; +} + +/** + * bfq_bic_change_cgroup - move @bic to @cgroup. + * @bic: the bic being migrated. + * @cgroup: the destination cgroup. + * + * When the task owning @bic is moved to @cgroup, @bic is immediately + * moved into its new parent group. + */ +static void bfq_bic_change_cgroup(struct bfq_io_cq *bic, + struct cgroup_subsys_state *css) +{ + struct bfq_data *bfqd; + unsigned long uninitialized_var(flags); + + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), + &flags); + if (bfqd != NULL) { + __bfq_bic_change_cgroup(bfqd, bic, css); + bfq_put_bfqd_unlock(bfqd, &flags); + } +} + +/** + * bfq_bic_update_cgroup - update the cgroup of @bic. + * @bic: the @bic to update. + * + * Make sure that @bic is enqueued in the cgroup of the current task. + * We need this in addition to moving bics during the cgroup attach + * phase because the task owning @bic could be at its first disk + * access or we may end up in the root cgroup as the result of a + * memory allocation failure and here we try to move to the right + * group. + * + * Must be called under the queue lock. It is safe to use the returned + * value even after the rcu_read_unlock() as the migration/destruction + * paths act under the queue lock too. IOW it is impossible to race with + * group migration/destruction and end up with an invalid group as: + * a) here cgroup has not yet been destroyed, nor its destroy callback + * has started execution, as current holds a reference to it, + * b) if it is destroyed after rcu_read_unlock() [after current is + * migrated to a different cgroup] its attach() callback will have + * taken care of remove all the references to the old cgroup data. 
+ */ +static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_group *bfqg; + struct cgroup_subsys_state *css; + + BUG_ON(bfqd == NULL); + + rcu_read_lock(); + css = task_css(current, bfqio_cgrp_id); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, css); + rcu_read_unlock(); + + return bfqg; +} + +/** + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. + * @st: the service tree being flushed. + */ +static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) +{ + struct bfq_entity *entity = st->first_idle; + + for (; entity != NULL; entity = st->first_idle) + __bfq_deactivate_entity(entity, 0); +} + +/** + * bfq_reparent_leaf_entity - move leaf entity to the root_group. + * @bfqd: the device data structure with the root group. + * @entity: the entity to move. + */ +static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(bfqq == NULL); + bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); + return; +} + +/** + * bfq_reparent_active_entities - move to the root group all active + * entities. + * @bfqd: the device data structure with the root group. + * @bfqg: the group to move from. + * @st: the service tree with the entities. + * + * Needs queue_lock to be taken and reference to be valid over the call. + */ +static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st) +{ + struct rb_root *active = &st->active; + struct bfq_entity *entity = NULL; + + if (!RB_EMPTY_ROOT(&st->active)) + entity = bfq_entity_of(rb_first(active)); + + for (; entity != NULL; entity = bfq_entity_of(rb_first(active))) + bfq_reparent_leaf_entity(bfqd, entity); + + if (bfqg->sched_data.in_service_entity != NULL) + bfq_reparent_leaf_entity(bfqd, + bfqg->sched_data.in_service_entity); + + return; +} + +/** + * bfq_destroy_group - destroy @bfqg. + * @bgrp: the bfqio_cgroup containing @bfqg. + * @bfqg: the group being destroyed. + * + * Destroy @bfqg, making sure that it is not referenced from its parent. + */ +static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) +{ + struct bfq_data *bfqd; + struct bfq_service_tree *st; + struct bfq_entity *entity = bfqg->my_entity; + unsigned long uninitialized_var(flags); + int i; + + hlist_del(&bfqg->group_node); + + /* + * Empty all service_trees belonging to this group before + * deactivating the group itself. + */ + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { + st = bfqg->sched_data.service_tree + i; + + /* + * The idle tree may still contain bfq_queues belonging + * to exited task because they never migrated to a different + * cgroup from the one being destroyed now. No one else + * can access them so it's safe to act without any lock. + */ + bfq_flush_idle_tree(st); + + /* + * It may happen that some queues are still active + * (busy) upon group destruction (if the corresponding + * processes have been forced to terminate). We move + * all the leaf entities corresponding to these queues + * to the root_group. + * Also, it may happen that the group has an entity + * in service, which is disconnected from the active + * tree: it must be moved, too. + * There is no need to put the sync queues, as the + * scheduler has taken no reference. 
+ */ + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + bfq_reparent_active_entities(bfqd, bfqg, st); + bfq_put_bfqd_unlock(bfqd, &flags); + } + BUG_ON(!RB_EMPTY_ROOT(&st->active)); + BUG_ON(!RB_EMPTY_ROOT(&st->idle)); + } + BUG_ON(bfqg->sched_data.next_in_service != NULL); + BUG_ON(bfqg->sched_data.in_service_entity != NULL); + + /* + * We may race with device destruction, take extra care when + * dereferencing bfqg->bfqd. + */ + bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); + if (bfqd != NULL) { + hlist_del(&bfqg->bfqd_node); + __bfq_deactivate_entity(entity, 0); + bfq_put_async_queues(bfqd, bfqg); + bfq_put_bfqd_unlock(bfqd, &flags); + } + BUG_ON(entity->tree != NULL); + + /* + * No need to defer the kfree() to the end of the RCU grace + * period: we are called from the destroy() callback of our + * cgroup, so we can be sure that no one is a) still using + * this cgroup or b) doing lookups in it. + */ + kfree(bfqg); +} + +static void bfq_end_wr_async(struct bfq_data *bfqd) +{ + struct hlist_node *tmp; + struct bfq_group *bfqg; + + hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) + bfq_end_wr_async_queues(bfqd, bfqg); + bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + +/** + * bfq_disconnect_groups - disconnect @bfqd from all its groups. + * @bfqd: the device descriptor being exited. + * + * When the device exits we just make sure that no lookup can return + * the now unused group structures. They will be deallocated on cgroup + * destruction. + */ +static void bfq_disconnect_groups(struct bfq_data *bfqd) +{ + struct hlist_node *tmp; + struct bfq_group *bfqg; + + bfq_log(bfqd, "disconnect_groups beginning"); + hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) { + hlist_del(&bfqg->bfqd_node); + + __bfq_deactivate_entity(bfqg->my_entity, 0); + + /* + * Don't remove from the group hash, just set an + * invalid key. No lookups can race with the + * assignment as bfqd is being destroyed; this + * implies also that new elements cannot be added + * to the list. + */ + rcu_assign_pointer(bfqg->bfqd, NULL); + + bfq_log(bfqd, "disconnect_groups: put async for group %p", + bfqg); + bfq_put_async_queues(bfqd, bfqg); + } +} + +static inline void bfq_free_root_group(struct bfq_data *bfqd) +{ + struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; + struct bfq_group *bfqg = bfqd->root_group; + + bfq_put_async_queues(bfqd, bfqg); + + spin_lock_irq(&bgrp->lock); + hlist_del_rcu(&bfqg->group_node); + spin_unlock_irq(&bgrp->lock); + + /* + * No need to synchronize_rcu() here: since the device is gone + * there cannot be any read-side access to its root_group. 
+ */ + kfree(bfqg); +} + +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) +{ + struct bfq_group *bfqg; + struct bfqio_cgroup *bgrp; + int i; + + bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node); + if (bfqg == NULL) + return NULL; + + bfqg->entity.parent = NULL; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + bgrp = &bfqio_root_cgroup; + spin_lock_irq(&bgrp->lock); + rcu_assign_pointer(bfqg->bfqd, bfqd); + hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); + spin_unlock_irq(&bgrp->lock); + + return bfqg; +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \ + struct cftype *cftype) \ +{ \ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ + u64 ret = -ENODEV; \ + \ + mutex_lock(&bfqio_mutex); \ + if (bfqio_is_removed(bgrp)) \ + goto out_unlock; \ + \ + spin_lock_irq(&bgrp->lock); \ + ret = bgrp->__VAR; \ + spin_unlock_irq(&bgrp->lock); \ + \ +out_unlock: \ + mutex_unlock(&bfqio_mutex); \ + return ret; \ +} + +SHOW_FUNCTION(weight); +SHOW_FUNCTION(ioprio); +SHOW_FUNCTION(ioprio_class); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ +static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\ + struct cftype *cftype, \ + u64 val) \ +{ \ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ + struct bfq_group *bfqg; \ + int ret = -EINVAL; \ + \ + if (val < (__MIN) || val > (__MAX)) \ + return ret; \ + \ + ret = -ENODEV; \ + mutex_lock(&bfqio_mutex); \ + if (bfqio_is_removed(bgrp)) \ + goto out_unlock; \ + ret = 0; \ + \ + spin_lock_irq(&bgrp->lock); \ + bgrp->__VAR = (unsigned short)val; \ + hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \ + /* \ + * Setting the ioprio_changed flag of the entity \ + * to 1 with new_##__VAR == ##__VAR would re-set \ + * the value of the weight to its ioprio mapping. \ + * Set the flag only if necessary. \ + */ \ + if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ + bfqg->entity.new_##__VAR = (unsigned short)val; \ + /* \ + * Make sure that the above new value has been \ + * stored in bfqg->entity.new_##__VAR before \ + * setting the ioprio_changed flag. In fact, \ + * this flag may be read asynchronously (in \ + * critical sections protected by a different \ + * lock than that held here), and finding this \ + * flag set may cause the execution of the code \ + * for updating parameters whose value may \ + * depend also on bfqg->entity.new_##__VAR (in \ + * __bfq_entity_update_weight_prio). \ + * This barrier makes sure that the new value \ + * of bfqg->entity.new_##__VAR is correctly \ + * seen in that code. 
\ + */ \ + smp_wmb(); \ + bfqg->entity.ioprio_changed = 1; \ + } \ + } \ + spin_unlock_irq(&bgrp->lock); \ + \ +out_unlock: \ + mutex_unlock(&bfqio_mutex); \ + return ret; \ +} + +STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); +STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); +STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); +#undef STORE_FUNCTION + +static struct cftype bfqio_files[] = { + { + .name = "weight", + .read_u64 = bfqio_cgroup_weight_read, + .write_u64 = bfqio_cgroup_weight_write, + }, + { + .name = "ioprio", + .read_u64 = bfqio_cgroup_ioprio_read, + .write_u64 = bfqio_cgroup_ioprio_write, + }, + { + .name = "ioprio_class", + .read_u64 = bfqio_cgroup_ioprio_class_read, + .write_u64 = bfqio_cgroup_ioprio_class_write, + }, + { }, /* terminate */ +}; + +static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state + *parent_css) +{ + struct bfqio_cgroup *bgrp; + + if (parent_css != NULL) { + bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); + if (bgrp == NULL) + return ERR_PTR(-ENOMEM); + } else + bgrp = &bfqio_root_cgroup; + + spin_lock_init(&bgrp->lock); + INIT_HLIST_HEAD(&bgrp->group_data); + bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; + bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; + + return &bgrp->css; +} + +/* + * We cannot support shared io contexts, as we have no means to support + * two tasks with the same ioc in two different groups without major rework + * of the main bic/bfqq data structures. By now we allow a task to change + * its cgroup only if it's the only owner of its ioc; the drawback of this + * behavior is that a group containing a task that forked using CLONE_IO + * will not be destroyed until the tasks sharing the ioc die. + */ +static int bfqio_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct io_context *ioc; + int ret = 0; + + cgroup_taskset_for_each(task, tset) { + /* + * task_lock() is needed to avoid races with + * exit_io_context() + */ + task_lock(task); + ioc = task->io_context; + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) + /* + * ioc == NULL means that the task is either too + * young or exiting: if it has still no ioc the + * ioc can't be shared, if the task is exiting the + * attach will fail anyway, no matter what we + * return here. + */ + ret = -EINVAL; + task_unlock(task); + if (ret) + break; + } + + return ret; +} + +static void bfqio_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct io_context *ioc; + struct io_cq *icq; + + /* + * IMPORTANT NOTE: The move of more than one process at a time to a + * new group has not yet been tested. + */ + cgroup_taskset_for_each(task, tset) { + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + /* + * Handle cgroup change here. 
+ */ + rcu_read_lock(); + hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node) + if (!strncmp( + icq->q->elevator->type->elevator_name, + "bfq", ELV_NAME_MAX)) + bfq_bic_change_cgroup(icq_to_bic(icq), + css); + rcu_read_unlock(); + put_io_context(ioc); + } + } +} + +static void bfqio_destroy(struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + struct hlist_node *tmp; + struct bfq_group *bfqg; + + /* + * Since we are destroying the cgroup, there are no more tasks + * referencing it, and all the RCU grace periods that may have + * referenced it are ended (as the destruction of the parent + * cgroup is RCU-safe); bgrp->group_data will not be accessed by + * anything else and we don't need any synchronization. + */ + hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node) + bfq_destroy_group(bgrp, bfqg); + + BUG_ON(!hlist_empty(&bgrp->group_data)); + + kfree(bgrp); +} + +static int bfqio_css_online(struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + + mutex_lock(&bfqio_mutex); + bgrp->online = true; + mutex_unlock(&bfqio_mutex); + + return 0; +} + +static void bfqio_css_offline(struct cgroup_subsys_state *css) +{ + struct bfqio_cgroup *bgrp = css_to_bfqio(css); + + mutex_lock(&bfqio_mutex); + bgrp->online = false; + mutex_unlock(&bfqio_mutex); +} + +struct cgroup_subsys bfqio_cgrp_subsys = { + .css_alloc = bfqio_create, + .css_online = bfqio_css_online, + .css_offline = bfqio_css_offline, + .can_attach = bfqio_can_attach, + .attach = bfqio_attach, + .css_free = bfqio_destroy, + .legacy_cftypes = bfqio_files, +}; +#else +static inline void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class; + entity->sched_data = &bfqg->sched_data; +} + +static inline struct bfq_group * +bfq_bic_update_cgroup(struct bfq_io_cq *bic) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + return bfqd->root_group; +} + +static inline void bfq_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_entity *entity, + struct bfq_group *bfqg) +{ +} + +static void bfq_end_wr_async(struct bfq_data *bfqd) +{ + bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + +static inline void bfq_disconnect_groups(struct bfq_data *bfqd) +{ + bfq_put_async_queues(bfqd, bfqd->root_group); +} + +static inline void bfq_free_root_group(struct bfq_data *bfqd) +{ + kfree(bfqd->root_group); +} + +static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) +{ + struct bfq_group *bfqg; + int i; + + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); + if (bfqg == NULL) + return NULL; + + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + return bfqg; +} +#endif diff -Nur linux-4.2/block/bfq-ioc.c linux-4.2-pck/block/bfq-ioc.c --- linux-4.2/block/bfq-ioc.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/block/bfq-ioc.c 2015-09-08 00:48:22.215031180 -0300 @@ -0,0 +1,36 @@ +/* + * BFQ: I/O context handling. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + */ + +/** + * icq_to_bic - convert iocontext queue structure to bfq_io_cq. + * @icq: the iocontext queue. 
+ */ +static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) +{ + /* bic->icq is the first member, %NULL will convert to %NULL */ + return container_of(icq, struct bfq_io_cq, icq); +} + +/** + * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. + * @bfqd: the lookup key. + * @ioc: the io_context of the process doing I/O. + * + * Queue lock must be held. + */ +static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct io_context *ioc) +{ + if (ioc) + return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); + return NULL; +} diff -Nur linux-4.2/block/bfq-iosched.c linux-4.2-pck/block/bfq-iosched.c --- linux-4.2/block/bfq-iosched.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/block/bfq-iosched.c 2015-09-08 00:48:22.218364355 -0300 @@ -0,0 +1,4218 @@ +/* + * Budget Fair Queueing (BFQ) disk scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. + * + * BFQ is a proportional-share storage-I/O scheduling algorithm based on + * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, + * measured in number of sectors, to processes instead of time slices. The + * device is not granted to the in-service process for a given time slice, + * but until it has exhausted its assigned budget. This change from the time + * to the service domain allows BFQ to distribute the device throughput + * among processes as desired, without any distortion due to ZBR, workload + * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, + * called B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated to processes. Thanks to the + * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to + * I/O-bound processes issuing sequential requests (to boost the + * throughput), and yet guarantee a low latency to interactive and soft + * real-time applications. + * + * BFQ is described in [1], where also a reference to the initial, more + * theoretical paper on BFQ can be found. The interested reader can find + * in the latter paper full details on the main algorithm, as well as + * formulas of the guarantees and formal proofs of all the properties. + * With respect to the version of BFQ presented in these papers, this + * implementation adds a few more heuristics, such as the one that + * guarantees a low latency to soft real-time applications, and a + * hierarchical extension based on H-WF2Q+. + * + * B-WF2Q+ is based on WF2Q+, that is described in [2], together with + * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) + * complexity derives from the one introduced with EEVDF in [3]. + * + * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness + * with the BFQ Disk I/O Scheduler'', + * Proceedings of the 5th Annual International Systems and Storage + * Conference (SYSTOR '12), June 2012. + * + * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf + * + * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing + * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, + * Oct 1997. + * + * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz + * + * [3] I. Stoica and H. 
Abdel-Wahab, ``Earliest Eligible Virtual Deadline + * First: A Flexible and Accurate Mechanism for Proportional Share + * Resource Allocation,'' technical report. + * + * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bfq.h" +#include "blk.h" + +/* Expiration time of sync (0) and async (1) requests, in jiffies. */ +static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; + +/* Maximum backwards seek, in KiB. */ +static const int bfq_back_max = 16 * 1024; + +/* Penalty of a backwards seek, in number of sectors. */ +static const int bfq_back_penalty = 2; + +/* Idling period duration, in jiffies. */ +static int bfq_slice_idle = HZ / 125; + +/* Default maximum budget values, in sectors and number of requests. */ +static const int bfq_default_max_budget = 16 * 1024; +static const int bfq_max_budget_async_rq = 4; + +/* + * Async to sync throughput distribution is controlled as follows: + * when an async request is served, the entity is charged the number + * of sectors of the request, multiplied by the factor below + */ +static const int bfq_async_charge_factor = 10; + +/* Default timeout values, in jiffies, approximating CFQ defaults. */ +static const int bfq_timeout_sync = HZ / 8; +static int bfq_timeout_async = HZ / 25; + +struct kmem_cache *bfq_pool; + +/* Below this threshold (in ms), we consider thinktime immediate. */ +#define BFQ_MIN_TT 2 + +/* hw_tag detection: parallel requests threshold and min samples needed. */ +#define BFQ_HW_QUEUE_THRESHOLD 4 +#define BFQ_HW_QUEUE_SAMPLES 32 + +#define BFQQ_SEEK_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) + +/* Min samples used for peak rate estimation (for autotuning). */ +#define BFQ_PEAK_RATE_SAMPLES 32 + +/* Shift used for peak rate fixed precision calculations. */ +#define BFQ_RATE_SHIFT 16 + +/* + * By default, BFQ computes the duration of the weight raising for + * interactive applications automatically, using the following formula: + * duration = (R / r) * T, where r is the peak rate of the device, and + * R and T are two reference parameters. + * In particular, R is the peak rate of the reference device (see below), + * and T is a reference time: given the systems that are likely to be + * installed on the reference device according to its speed class, T is + * about the maximum time needed, under BFQ and while reading two files in + * parallel, to load typical large applications on these systems. + * In practice, the slower/faster the device at hand is, the more/less it + * takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to interactive + * applications. + * + * BFQ uses four different reference pairs (R, T), depending on: + * . whether the device is rotational or non-rotational; + * . whether the device is slow, such as old or portable HDDs, as well as + * SD cards, or fast, such as newer HDDs and SSDs. + * + * The device's speed class is dynamically (re)detected in + * bfq_update_peak_rate() every time the estimated peak rate is updated. + * + * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] + * are the reference values for a slow/fast rotational device, whereas + * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for + * a slow/fast non-rotational device. Finally, device_speed_thresh are the + * thresholds used to switch between speed classes. 
+ * Both the reference peak rates and the thresholds are measured in + * sectors/usec, left-shifted by BFQ_RATE_SHIFT. + */ +static int R_slow[2] = {1536, 10752}; +static int R_fast[2] = {17415, 34791}; +/* + * To improve readability, a conversion function is used to initialize the + * following arrays, which entails that they can be initialized only in a + * function. + */ +static int T_slow[2]; +static int T_fast[2]; +static int device_speed_thresh[2]; + +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) +#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); + +#include "bfq-ioc.c" +#include "bfq-sched.c" +#include "bfq-cgroup.c" + +#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_RT) + +#define bfq_sample_valid(samples) ((samples) > 80) + +/* + * The following macro groups conditions that need to be evaluated when + * checking if existing queues and groups form a symmetric scenario + * and therefore idling can be reduced or disabled for some of the + * queues. See the comment to the function bfq_bfqq_must_not_expire() + * for further details. + */ +#ifdef CONFIG_CGROUP_BFQIO +#define symmetric_scenario (!bfqd->active_numerous_groups && \ + !bfq_differentiated_weights(bfqd)) +#else +#define symmetric_scenario (!bfq_differentiated_weights(bfqd)) +#endif + +/* + * We regard a request as SYNC, if either it's a read or has the SYNC bit + * set (in which case it could also be a direct WRITE). + */ +static inline int bfq_bio_sync(struct bio *bio) +{ + if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) + return 1; + + return 0; +} + +/* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. + */ +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + kblockd_schedule_work(&bfqd->unplug_work); + } +} + +/* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closesr to the head right now. Distance + * behind the head is penalized and only allowed to a certain extent. + */ +static struct request *bfq_choose_req(struct bfq_data *bfqd, + struct request *rq1, + struct request *rq2, + sector_t last) +{ + sector_t s1, s2, d1 = 0, d2 = 0; + unsigned long back_max; +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ + unsigned wrap = 0; /* bit mask: requests behind the disk head? */ + + if (rq1 == NULL || rq1 == rq2) + return rq2; + if (rq2 == NULL) + return rq1; + + if (rq_is_sync(rq1) && !rq_is_sync(rq2)) + return rq1; + else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) + return rq2; + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) + return rq1; + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) + return rq2; + + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); + + /* + * By definition, 1KiB is 2 sectors. + */ + back_max = bfqd->bfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. 
+ */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ1_WRAP; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ2_WRAP; + + /* Found required data */ + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! + */ + switch (wrap) { + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ + if (d1 < d2) + return rq1; + else if (d2 < d1) + return rq2; + else { + if (s1 >= s2) + return rq1; + else + return rq2; + } + + case BFQ_RQ2_WRAP: + return rq1; + case BFQ_RQ1_WRAP: + return rq2; + case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) + return rq1; + else + return rq2; + } +} + +static struct bfq_queue * +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) +{ + struct rb_node **p, *parent; + struct bfq_queue *bfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + bfqq = rb_entry(parent, struct bfq_queue, pos_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_right; + else if (sector < blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_left; + else + break; + p = n; + bfqq = NULL; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", + (long long unsigned)sector, + bfqq != NULL ? bfqq->pid : 0); + + return bfqq; +} + +static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct rb_node **p, *parent; + struct bfq_queue *__bfqq; + + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) + return; + + bfqq->pos_root = &bfqd->rq_pos_tree; + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, + blk_rq_pos(bfqq->next_rq), &parent, &p); + if (__bfqq == NULL) { + rb_link_node(&bfqq->pos_node, parent, p); + rb_insert_color(&bfqq->pos_node, bfqq->pos_root); + } else + bfqq->pos_root = NULL; +} + +/* + * Tell whether there are active queues or groups with differentiated weights. + */ +static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) +{ + /* + * For weights to differ, at least one of the trees must contain + * at least two nodes. + */ + return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && + (bfqd->queue_weights_tree.rb_node->rb_left || + bfqd->queue_weights_tree.rb_node->rb_right) +#ifdef CONFIG_CGROUP_BFQIO + ) || + (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && + (bfqd->group_weights_tree.rb_node->rb_left || + bfqd->group_weights_tree.rb_node->rb_right) +#endif + ); +} + +/* + * If the weight-counter tree passed as input contains no counter for + * the weight of the input entity, then add that counter; otherwise just + * increment the existing counter. + * + * Note that weight-counter trees contain few nodes in mostly symmetric + * scenarios. For example, if all queues have the same weight, then the + * weight-counter tree for the queues may contain at most one node. 
+ * This holds even if low_latency is on, because weight-raised queues + * are not inserted in the tree. + * In most scenarios, the rate at which nodes are created/destroyed + * should be low too. + */ +static void bfq_weights_tree_add(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* + * Do not insert if the entity is already associated with a + * counter, which happens if: + * 1) the entity is associated with a queue, + * 2) a request arrival has caused the queue to become both + * non-weight-raised, and hence change its weight, and + * backlogged; in this respect, each of the two events + * causes an invocation of this function, + * 3) this is the invocation of this function caused by the + * second event. This second invocation is actually useless, + * and we handle this fact by exiting immediately. More + * efficient or clearer solutions might possibly be adopted. + */ + if (entity->weight_counter) + return; + + while (*new) { + struct bfq_weight_counter *__counter = container_of(*new, + struct bfq_weight_counter, + weights_node); + parent = *new; + + if (entity->weight == __counter->weight) { + entity->weight_counter = __counter; + goto inc_counter; + } + if (entity->weight < __counter->weight) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), + GFP_ATOMIC); + entity->weight_counter->weight = entity->weight; + rb_link_node(&entity->weight_counter->weights_node, parent, new); + rb_insert_color(&entity->weight_counter->weights_node, root); + +inc_counter: + entity->weight_counter->num_active++; +} + +/* + * Decrement the weight counter associated with the entity, and, if the + * counter reaches 0, remove the counter from the tree. + * See the comments to the function bfq_weights_tree_add() for considerations + * about overhead. + */ +static void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) +{ + if (!entity->weight_counter) + return; + + BUG_ON(RB_EMPTY_ROOT(root)); + BUG_ON(entity->weight_counter->weight != entity->weight); + + BUG_ON(!entity->weight_counter->num_active); + entity->weight_counter->num_active--; + if (entity->weight_counter->num_active > 0) + goto reset_entity_pointer; + + rb_erase(&entity->weight_counter->weights_node, root); + kfree(entity->weight_counter); + +reset_entity_pointer: + entity->weight_counter = NULL; +} + +static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *last) +{ + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next = NULL, *prev = NULL; + + BUG_ON(RB_EMPTY_NODE(&last->rb_node)); + + if (rbprev != NULL) + prev = rb_entry_rq(rbprev); + + if (rbnext != NULL) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&bfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } + + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); +} + +/* see the definition of bfq_async_charge_factor for details */ +static inline unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) +{ + return blk_rq_sectors(rq) * + (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * + bfq_async_charge_factor)); +} + +/** + * bfq_updated_next_req - update the queue after a new next_rq selection. + * @bfqd: the device data the queue belongs to. 
+ * @bfqq: the queue to update. + * + * If the first request of a queue changes we make sure that the queue + * has enough budget to serve at least its first request (if the + * request has grown). We do this because if the queue has not enough + * budget for its first request, it has to go through two dispatch + * rounds to actually get it dispatched. + */ +static void bfq_updated_next_req(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct request *next_rq = bfqq->next_rq; + unsigned long new_budget; + + if (next_rq == NULL) + return; + + if (bfqq == bfqd->in_service_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. + */ + return; + + BUG_ON(entity->tree != &st->active); + BUG_ON(entity == entity->sched_data->in_service_entity); + + new_budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + if (entity->budget != new_budget) { + entity->budget = new_budget; + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", + new_budget); + bfq_activate_bfqq(bfqd, bfqq); + } +} + +static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + return dur; +} + +static inline unsigned +bfq_bfqq_cooperations(struct bfq_queue *bfqq) +{ + return bfqq->bic ? bfqq->bic->cooperations : 0; +} + +static inline void +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + if (bic->saved_idle_window) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); + if (bic->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + /* Assuming that the flag in_large_burst is already correctly set */ + if (bic->wr_time_left && bfqq->bfqd->low_latency && + !bfq_bfqq_in_large_burst(bfqq) && + bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { + /* + * Start a weight raising period with the duration given by + * the raising_time_left snapshot. + */ + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues++; + bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bic->wr_time_left; + bfqq->last_wr_start_finish = jiffies; + bfqq->entity.ioprio_changed = 1; + } + /* + * Clear wr_time_left to prevent bfq_bfqq_save_state() from + * getting confused about the queue's need of a weight-raising + * period. + */ + bic->wr_time_left = 0; +} + +/* Must be called with the queue_lock held. 
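+ * The returned value counts only the references held by the owning
+ * process(es): total references minus those due to allocated requests
+ * and to the entity being on a service tree.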
*/ +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; + + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; + process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); + return process_refs; +} + +/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ +static inline void bfq_reset_burst_list(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_queue *item; + struct hlist_node *n; + + hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) + hlist_del_init(&item->burst_list_node); + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); + bfqd->burst_size = 1; +} + +/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ +static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* Increment burst size to take into account also bfqq */ + bfqd->burst_size++; + + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { + struct bfq_queue *pos, *bfqq_item; + struct hlist_node *n; + + /* + * Enough queues have been activated shortly after each + * other to consider this burst as large. + */ + bfqd->large_burst = true; + + /* + * We can now mark all queues in the burst list as + * belonging to a large burst. + */ + hlist_for_each_entry(bfqq_item, &bfqd->burst_list, + burst_list_node) + bfq_mark_bfqq_in_large_burst(bfqq_item); + bfq_mark_bfqq_in_large_burst(bfqq); + + /* + * From now on, and until the current burst finishes, any + * new queue being activated shortly after the last queue + * was inserted in the burst can be immediately marked as + * belonging to a large burst. So the burst list is not + * needed any more. Remove it. + */ + hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, + burst_list_node) + hlist_del_init(&pos->burst_list_node); + } else /* burst not yet large: add bfqq to the burst list */ + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); +} + +/* + * If many queues happen to become active shortly after each other, then, + * to help the processes associated to these queues get their job done as + * soon as possible, it is usually better to not grant either weight-raising + * or device idling to these queues. In this comment we describe, firstly, + * the reasons why this fact holds, and, secondly, the next function, which + * implements the main steps needed to properly mark these queues so that + * they can then be treated in a different way. + * + * As for the terminology, we say that a queue becomes active, i.e., + * switches from idle to backlogged, either when it is created (as a + * consequence of the arrival of an I/O request), or, if already existing, + * when a new request for the queue arrives while the queue is idle. + * Bursts of activations, i.e., activations of different queues occurring + * shortly after each other, are typically caused by services or applications + * that spawn or reactivate many parallel threads/processes. Examples are + * systemd during boot or git grep. + * + * These services or applications benefit mostly from a high throughput: + * the quicker the requests of the activated queues are cumulatively served, + * the sooner the target job of these queues gets completed. As a consequence, + * weight-raising any of these queues, which also implies idling the device + * for it, is almost always counterproductive: in most cases it just lowers + * throughput. 
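+ * (Idling would make the device wait, in turn, for the next request
+ * of each of the many queues in the burst, leaving it underutilized
+ * for most of the time.)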
+ * + * On the other hand, a burst of activations may be also caused by the start + * of an application that does not consist in a lot of parallel I/O-bound + * threads. In fact, with a complex application, the burst may be just a + * consequence of the fact that several processes need to be executed to + * start-up the application. To start an application as quickly as possible, + * the best thing to do is to privilege the I/O related to the application + * with respect to all other I/O. Therefore, the best strategy to start as + * quickly as possible an application that causes a burst of activations is + * to weight-raise all the queues activated during the burst. This is the + * exact opposite of the best strategy for the other type of bursts. + * + * In the end, to take the best action for each of the two cases, the two + * types of bursts need to be distinguished. Fortunately, this seems + * relatively easy to do, by looking at the sizes of the bursts. In + * particular, we found a threshold such that bursts with a larger size + * than that threshold are apparently caused only by services or commands + * such as systemd or git grep. For brevity, hereafter we call just 'large' + * these bursts. BFQ *does not* weight-raise queues whose activations occur + * in a large burst. In addition, for each of these queues BFQ performs or + * does not perform idling depending on which choice boosts the throughput + * most. The exact choice depends on the device and request pattern at + * hand. + * + * Turning back to the next function, it implements all the steps needed + * to detect the occurrence of a large burst and to properly mark all the + * queues belonging to it (so that they can then be treated in a different + * way). This goal is achieved by maintaining a special "burst list" that + * holds, temporarily, the queues that belong to the burst in progress. The + * list is then used to mark these queues as belonging to a large burst if + * the burst does become large. The main steps are the following. + * + * . when the very first queue is activated, the queue is inserted into the + * list (as it could be the first queue in a possible burst) + * + * . if the current burst has not yet become large, and a queue Q that does + * not yet belong to the burst is activated shortly after the last time + * at which a new queue entered the burst list, then the function appends + * Q to the burst list + * + * . if, as a consequence of the previous step, the burst size reaches + * the large-burst threshold, then + * + * . all the queues in the burst list are marked as belonging to a + * large burst + * + * . the burst list is deleted; in fact, the burst list already served + * its purpose (keeping temporarily track of the queues in a burst, + * so as to be able to mark them as belonging to a large burst in the + * previous sub-step), and now is not needed any more + * + * . the device enters a large-burst mode + * + * . if a queue Q that does not belong to the burst is activated while + * the device is in large-burst mode and shortly after the last time + * at which a queue either entered the burst list or was marked as + * belonging to the current large burst, then Q is immediately marked + * as belonging to a large burst. + * + * . 
if a queue Q that does not belong to the burst is activated a while + * later, i.e., not shortly after, than the last time at which a queue + * either entered the burst list or was marked as belonging to the + * current large burst, then the current burst is deemed as finished and: + * + * . the large-burst mode is reset if set + * + * . the burst list is emptied + * + * . Q is inserted in the burst list, as Q may be the first queue + * in a possible new burst (then the burst list contains just Q + * after this step). + */ +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool idle_for_long_time) +{ + /* + * If bfqq happened to be activated in a burst, but has been idle + * for at least as long as an interactive queue, then we assume + * that, in the overall I/O initiated in the burst, the I/O + * associated to bfqq is finished. So bfqq does not need to be + * treated as a queue belonging to a burst anymore. Accordingly, + * we reset bfqq's in_large_burst flag if set, and remove bfqq + * from the burst list if it's there. We do not decrement instead + * burst_size, because the fact that bfqq does not need to belong + * to the burst list any more does not invalidate the fact that + * bfqq may have been activated during the current burst. + */ + if (idle_for_long_time) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + + /* + * If bfqq is already in the burst list or is part of a large + * burst, then there is nothing else to do. + */ + if (!hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq)) + return; + + /* + * If bfqq's activation happens late enough, then the current + * burst is finished, and related data structures must be reset. + * + * In this respect, consider the special case where bfqq is the very + * first queue being activated. In this case, last_ins_in_burst is + * not yet significant when we get here. But it is easy to verify + * that, whether or not the following condition is true, bfqq will + * end up being inserted into the burst list. In particular the + * list will happen to contain only bfqq. And this is exactly what + * has to happen, as bfqq may be the first queue in a possible + * burst. + */ + if (time_is_before_jiffies(bfqd->last_ins_in_burst + + bfqd->bfq_burst_interval)) { + bfqd->large_burst = false; + bfq_reset_burst_list(bfqd, bfqq); + return; + } + + /* + * If we get here, then bfqq is being activated shortly after the + * last queue. So, if the current burst is also large, we can mark + * bfqq as belonging to this large burst immediately. + */ + if (bfqd->large_burst) { + bfq_mark_bfqq_in_large_burst(bfqq); + return; + } + + /* + * If we get here, then a large-burst state has not yet been + * reached, but bfqq is being activated shortly after the last + * queue. Then we add bfqq to the burst. + */ + bfq_add_to_burst(bfqd, bfqq); +} + +static void bfq_add_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_entity *entity = &bfqq->entity; + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; + unsigned long old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; + + bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + elv_rb_add(&bfqq->sort_list, rq); + + /* + * Check if this request is a better next-serve candidate. 
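+	 * (i.e., whether bfq_choose_req() picks rq, rather than the
+	 * current next_rq, as the request that is cheapest to reach
+	 * from bfqd->last_position.)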
+ */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + BUG_ON(next_rq == NULL); + bfqq->next_rq = next_rq; + + /* + * Adjust priority tree position, if next_rq changes. + */ + if (prev != bfqq->next_rq) + bfq_rq_pos_tree_add(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) { + bool soft_rt, coop_or_in_burst, + idle_for_long_time = time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); + + if (bfq_bfqq_sync(bfqq)) { + bool already_in_burst = + !hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq); + bfq_handle_burst(bfqd, bfqq, idle_for_long_time); + /* + * If bfqq was not already in the current burst, + * then, at this point, bfqq either has been + * added to the current burst or has caused the + * current burst to terminate. In particular, in + * the second case, bfqq has become the first + * queue in a possible new burst. + * In both cases last_ins_in_burst needs to be + * moved forward. + */ + if (!already_in_burst) + bfqd->last_ins_in_burst = jiffies; + } + + coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || + bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !coop_or_in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); + interactive = !coop_or_in_burst && idle_for_long_time; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (time_before(jiffies, + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle)) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + } + + if (!bfqd->low_latency) + goto add_bfqq_busy; + + if (bfq_bfqq_just_split(bfqq)) + goto set_ioprio_changed; + + /* + * If the queue: + * - is not being boosted, + * - has been idle for enough time, + * - is not a sync queue or is linked to a bfq_io_cq (it is + * shared "for its nature" or it is not shared and its + * requests have not been redirected to a shared queue) + * start a weight-raising period. + */ + if (old_wr_coeff == 1 && (interactive || soft_rt) && + (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } else if (old_wr_coeff > 1) { + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else if (coop_or_in_burst || + (bfqq->wr_cur_max_time == + bfqd->bfq_wr_rt_max_time && + !soft_rt)) { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq-> + wr_cur_max_time)); + } else if (time_before( + bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time, + jiffies + + bfqd->bfq_wr_rt_max_time) && + soft_rt) { + /* + * + * The remaining weight-raising time is lower + * than bfqd->bfq_wr_rt_max_time, which means + * that the application is enjoying weight + * raising either because deemed soft-rt in + * the near past, or because deemed interactive + * a long ago. 
+ * In both cases, resetting now the current + * remaining weight-raising time for the + * application to the weight-raising duration + * for soft rt applications would not cause any + * latency increase for the application (as the + * new duration would be higher than the + * remaining time). + * + * In addition, the application is now meeting + * the requirements for being deemed soft rt. + * In the end we can correctly and safely + * (re)charge the weight-raising duration for + * the application with the weight-raising + * duration for soft rt applications. + * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. + */ + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + } +set_ioprio_changed: + if (old_wr_coeff != bfqq->wr_coeff) + entity->ioprio_changed = 1; +add_bfqq_busy: + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + bfq_add_bfqq_busy(bfqd, bfqq); + } else { + if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && + time_is_before_jiffies( + bfqq->last_wr_start_finish + + bfqd->bfq_wr_min_inter_arr_async)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + + bfqd->wr_busy_queues++; + entity->ioprio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "non-idle wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); + } + + if (bfqd->low_latency && + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) + bfqq->last_wr_start_finish = jiffies; +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio) +{ + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (bic == NULL) + return NULL; + + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + if (bfqq != NULL) + return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); + + return NULL; +} + +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + bfqd->rq_in_driver++; + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", + (long long unsigned)bfqd->last_position); +} + +static inline void bfq_deactivate_request(struct request_queue *q, + struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + BUG_ON(bfqd->rq_in_driver == 0); + bfqd->rq_in_driver--; +} + +static void bfq_remove_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync 
= rq_is_sync(rq); + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); + } + + if (rq->queuelist.prev != &rq->queuelist) + list_del_init(&rq->queuelist); + BUG_ON(bfqq->queued[sync] == 0); + bfqq->queued[sync]--; + bfqd->queued--; + elv_rb_del(&bfqq->sort_list, rq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) + bfq_del_bfqq_busy(bfqd, bfqq, 1); + /* + * Remove queue from request-position tree as it is empty. + */ + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + } + + if (rq->cmd_flags & REQ_META) { + BUG_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } +} + +static int bfq_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + + __rq = bfq_find_rq_fmerge(bfqd, bio); + if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; + } + + return ELEVATOR_NO_MERGE; +} + +static void bfq_merged_request(struct request_queue *q, struct request *req, + int type) +{ + if (type == ELEVATOR_FRONT_MERGE && + rb_prev(&req->rb_node) && + blk_rq_pos(req) < + blk_rq_pos(container_of(rb_prev(&req->rb_node), + struct request, rb_node))) { + struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *prev, *next_rq; + + /* Reposition request in its sort_list */ + elv_rb_del(&bfqq->sort_list, req); + elv_rb_add(&bfqq->sort_list, req); + /* Choose next request to be served for bfqq */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, + bfqd->last_position); + BUG_ON(next_rq == NULL); + bfqq->next_rq = next_rq; + /* + * If next_rq changes, update both the queue's budget to + * fit the new request and the queue's position in its + * rq_pos_tree. + */ + if (prev != bfqq->next_rq) { + bfq_updated_next_req(bfqd, bfqq); + bfq_rq_pos_tree_add(bfqd, bfqq); + } + } +} + +static void bfq_merged_requests(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + + /* + * If next and rq belong to the same bfq_queue and next is older + * than rq, then reposition rq in the fifo (by substituting next + * with rq). Otherwise, if next and rq belong to different + * bfq_queues, never reposition rq: in fact, we would have to + * reposition it with respect to next's position in its own fifo, + * which would most certainly be too expensive with respect to + * the benefits. 
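+	 * In any case, next is removed from its queue below, via
+	 * bfq_remove_request().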
+ */ + if (bfqq == next_bfqq && + !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + time_before(next->fifo_time, rq->fifo_time)) { + list_del_init(&rq->queuelist); + list_replace_init(&next->queuelist, &rq->queuelist); + rq->fifo_time = next->fifo_time; + } + + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + + bfq_remove_request(next); +} + +/* Must be called with bfqq != NULL */ +static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +{ + BUG_ON(bfqq == NULL); + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues--; + bfqq->wr_coeff = 1; + bfqq->wr_cur_max_time = 0; + /* Trigger a weight change on the next activation of the queue */ + bfqq->entity.ioprio_changed = 1; +} + +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + if (bfqg->async_bfqq[i][j] != NULL) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); + if (bfqg->async_idle_bfqq != NULL) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq); +} + +static void bfq_end_wr(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + spin_lock_irq(bfqd->queue->queue_lock); + + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); + + spin_unlock_irq(bfqd->queue->queue_lock); +} + +static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) +{ + if (request) + return blk_rq_pos(io_struct); + else + return ((struct bio *)io_struct)->bi_iter.bi_sector; +} + +static inline sector_t bfq_dist_from(sector_t pos1, + sector_t pos2) +{ + if (pos1 >= pos2) + return pos1 - pos2; + else + return pos2 - pos1; +} + +static inline int bfq_rq_close_to_sector(void *io_struct, bool request, + sector_t sector) +{ + return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= + BFQQ_SEEK_THR; +} + +static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) +{ + struct rb_root *root = &bfqd->rq_pos_tree; + struct rb_node *parent, *node; + struct bfq_queue *__bfqq; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); + if (__bfqq != NULL) + return __bfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector (rq_pos_tree sorted by + * next_request position). + */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + if (blk_rq_pos(__bfqq->next_rq) < sector) + node = rb_next(&__bfqq->pos_node); + else + node = rb_prev(&__bfqq->pos_node); + if (node == NULL) + return NULL; + + __bfqq = rb_entry(node, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + return NULL; +} + +/* + * bfqd - obvious + * cur_bfqq - passed in so that we don't decide that the current queue + * is closely cooperating with itself + * sector - used as a reference point to search for a close queue + */ +static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) +{ + struct bfq_queue *bfqq; + + if (bfq_class_idle(cur_bfqq)) + return NULL; + if (!bfq_bfqq_sync(cur_bfqq)) + return NULL; + if (BFQQ_SEEKY(cur_bfqq)) + return NULL; + + /* If device has only one backlogged bfq_queue, don't search. 
*/ + if (bfqd->busy_queues == 1) + return NULL; + + /* + * We should notice if some of the queues are cooperating, e.g. + * working closely on the same area of the disk. In that case, + * we can group them together and don't waste time idling. + */ + bfqq = bfqq_close(bfqd, sector); + if (bfqq == NULL || bfqq == cur_bfqq) + return NULL; + + /* + * Do not merge queues from different bfq_groups. + */ + if (bfqq->entity.parent != cur_bfqq->entity.parent) + return NULL; + + /* + * It only makes sense to merge sync queues. + */ + if (!bfq_bfqq_sync(bfqq)) + return NULL; + if (BFQQ_SEEKY(bfqq)) + return NULL; + + /* + * Do not merge queues of different priority classes. + */ + if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) + return NULL; + + return bfqq; +} + +static struct bfq_queue * +bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + int process_refs, new_process_refs; + struct bfq_queue *__bfqq; + + /* + * If there are no process references on the new_bfqq, then it is + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain + * may have dropped their last reference (not just their last process + * reference). + */ + if (!bfqq_process_refs(new_bfqq)) + return NULL; + + /* Avoid a circular list and skip interim queue merges. */ + while ((__bfqq = new_bfqq->new_bfqq)) { + if (__bfqq == bfqq) + return NULL; + new_bfqq = __bfqq; + } + + process_refs = bfqq_process_refs(bfqq); + new_process_refs = bfqq_process_refs(new_bfqq); + /* + * If the process for the bfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", + new_bfqq->pid); + + /* + * Merging is just a redirection: the requests of the process + * owning one of the two queues are redirected to the other queue. + * The latter queue, in its turn, is set as shared if this is the + * first time that the requests of some process are redirected to + * it. + * + * We redirect bfqq to new_bfqq and not the opposite, because we + * are in the context of the process owning bfqq, hence we have + * the io_cq of this process. So we can immediately configure this + * io_cq to redirect the requests of the process to new_bfqq. + * + * NOTE, even if new_bfqq coincides with the in-service queue, the + * io_cq of new_bfqq is not available, because, if the in-service + * queue is shared, bfqd->in_service_bic may not point to the + * io_cq of the in-service queue. + * Redirecting the requests of the process owning bfqq to the + * currently in-service queue is in any case the best option, as + * we feed the in-service queue with new requests close to the + * last request served and, by doing so, hopefully increase the + * throughput. + */ + bfqq->new_bfqq = new_bfqq; + atomic_add(process_refs, &new_bfqq->ref); + return new_bfqq; +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service queue + * or with a close queue among the scheduled queues. + * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue + * structure otherwise. + * + * The OOM queue is not allowed to participate to cooperation: in fact, since + * the requests temporarily redirected to the OOM queue could be redirected + * again to dedicated queues at any time, the state needed to correctly + * handle merging with the OOM queue would be quite complex and expensive + * to maintain. 
Besides, in such a critical condition as an out of memory, + * the benefits of queue merging may be little relevant, or even negligible. + */ +static struct bfq_queue * +bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request) +{ + struct bfq_queue *in_service_bfqq, *new_bfqq; + + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + + in_service_bfqq = bfqd->in_service_queue; + + if (in_service_bfqq == NULL || in_service_bfqq == bfqq || + !bfqd->in_service_bic || + unlikely(in_service_bfqq == &bfqd->oom_bfqq)) + goto check_scheduled; + + if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq)) + goto check_scheduled; + + if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq)) + goto check_scheduled; + + if (in_service_bfqq->entity.parent != bfqq->entity.parent) + goto check_scheduled; + + if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { + new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); + if (new_bfqq != NULL) + return new_bfqq; /* Merge with in-service queue */ + } + + /* + * Check whether there is a cooperator among currently scheduled + * queues. The only thing we need is that the bio/request is not + * NULL, as we need it to establish whether a cooperator exists. + */ +check_scheduled: + new_bfqq = bfq_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + + return NULL; +} + +static inline void +bfq_bfqq_save_state(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic == NULL, the queue is already shared or its requests + * have already been redirected to a shared queue; both idle window + * and weight raising state have already been saved. Do nothing. + */ + if (bfqq->bic == NULL) + return; + if (bfqq->bic->wr_time_left) + /* + * This is the queue of a just-started process, and would + * deserve weight raising: we set wr_time_left to the full + * weight-raising duration to trigger weight-raising when + * and if the queue is split and the first request of the + * queue is enqueued. + */ + bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); + else if (bfqq->wr_coeff > 1) { + unsigned long wr_duration = + jiffies - bfqq->last_wr_start_finish; + /* + * It may happen that a queue's weight raising period lasts + * longer than its wr_cur_max_time, as weight raising is + * handled only when a request is enqueued or dispatched (it + * does not use any timer). If the weight raising period is + * about to end, don't save it. + */ + if (bfqq->wr_cur_max_time <= wr_duration) + bfqq->bic->wr_time_left = 0; + else + bfqq->bic->wr_time_left = + bfqq->wr_cur_max_time - wr_duration; + /* + * The bfq_queue is becoming shared or the requests of the + * process owning the queue are being redirected to a shared + * queue. Stop the weight raising period of the queue, as in + * both cases it should not be owned by an interactive or + * soft real-time application. 
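+		 * (bfq_bfqq_end_wr(), invoked right below, also updates
+		 * wr_busy_queues accordingly.)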
+ */ + bfq_bfqq_end_wr(bfqq); + } else + bfqq->bic->wr_time_left = 0; + bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bfqq->bic->cooperations++; + bfqq->bic->failed_cooperations = 0; +} + +static inline void +bfq_get_bic_reference(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic has a non-NULL value, the bic to which it belongs + * is about to begin using a shared bfq_queue. + */ + if (bfqq->bic) + atomic_long_inc(&bfqq->bic->icq.ioc->refcount); +} + +static void +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (long unsigned)new_bfqq->pid); + /* Save weight raising and idle window of the merged queues */ + bfq_bfqq_save_state(bfqq); + bfq_bfqq_save_state(new_bfqq); + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); + /* + * Grab a reference to the bic, to prevent it from being destroyed + * before being possibly touched by a bfq_split_bfqq(). + */ + bfq_get_bic_reference(bfqq); + bfq_get_bic_reference(new_bfqq); + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ + bic_set_bfqq(bic, new_bfqq, 1); + bfq_mark_bfqq_coop(new_bfqq); + /* + * new_bfqq now belongs to at least two bics (it is a shared queue): + * set new_bfqq->bic to NULL. bfqq either: + * - does not belong to any bic any more, and hence bfqq->bic must + * be set to NULL, or + * - is a queue whose owning bics have already been redirected to a + * different queue, hence the queue is destined to not belong to + * any bic soon and bfqq->bic is already NULL (therefore the next + * assignment causes no harm). + */ + new_bfqq->bic = NULL; + bfqq->bic = NULL; + bfq_put_queue(bfqq); +} + +static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +{ + struct bfq_io_cq *bic = bfqq->bic; + struct bfq_data *bfqd = bfqq->bfqd; + + if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { + bic->failed_cooperations++; + if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) + bic->cooperations = 0; + } +} + +static int bfq_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq, *new_bfqq; + + /* + * Disallow merge of a sync bio into an async request. + */ + if (bfq_bio_sync(bio) && !rq_is_sync(rq)) + return 0; + + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. + * Queue lock is held here. + */ + bic = bfq_bic_lookup(bfqd, current->io_context); + if (bic == NULL) + return 0; + + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + /* + * We take advantage of this function to perform an early merge + * of the queues of possible cooperating processes. + */ + if (bfqq != NULL) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + if (new_bfqq != NULL) { + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + /* + * If we get here, the bio will be queued in the + * shared queue, i.e., new_bfqq, so use new_bfqq + * to decide whether bio and rq can be merged. 
+ */ + bfqq = new_bfqq; + } else + bfq_bfqq_increase_failed_cooperations(bfqq); + } + + return bfqq == RQ_BFQQ(rq); +} + +static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (bfqq != NULL) { + bfq_mark_bfqq_must_alloc(bfqq); + bfq_mark_bfqq_budget_new(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + + bfq_log_bfqq(bfqd, bfqq, + "set_in_service_queue, cur-budget = %lu", + bfqq->entity.budget); + } + + bfqd->in_service_queue = bfqq; +} + +/* + * Get and set a new queue for service. + */ +static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); + + __bfq_set_in_service_queue(bfqd, bfqq); + return bfqq; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + struct bfq_io_cq *bic; + unsigned long sl; + + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Processes have exited, don't wait. */ + bic = bfqd->in_service_bic; + if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) + return; + + bfq_mark_bfqq_wait_request(bfqq); + + /* + * We don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. So allow a little bit of time for him to submit a new rq. + * + * To prevent processes with (partly) seeky workloads from + * being too ill-treated, grant them a small fraction of the + * assigned budget before reducing the waiting time to + * BFQ_MIN_TT. This happened to help reduce latency. + */ + sl = bfqd->bfq_slice_idle; + /* + * Unless the queue is being weight-raised or the scenario is + * asymmetric, grant only minimum idle time if the queue either + * has been seeky for long enough or has already proved to be + * constantly seeky. + */ + if (bfq_sample_valid(bfqq->seek_samples) && + ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > + bfq_max_budget(bfqq->bfqd) / 8) || + bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && + symmetric_scenario) + sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); + else if (bfqq->wr_coeff > 1) + sl = sl * 3; + bfqd->last_idling_start = ktime_get(); + mod_timer(&bfqd->idle_slice_timer, jiffies + sl); + bfq_log(bfqd, "arm idle: %u/%u ms", + jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); +} + +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the disk + * throughput (always guaranteed with a time slice scheme as in CFQ). 
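+ * The allowed time is scaled by the ratio between the current and the
+ * original weight of the queue, so that weight-raised queues (except
+ * those in a soft real-time weight-raising period) get a
+ * proportionally longer budget timeout.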
+ */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + unsigned int timeout_coeff; + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfq_clear_bfqq_budget_new(bfqq); + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * + timeout_coeff)); +} + +/* + * Move request from internal lists to the request queue dispatch list. + */ +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* + * For consistency, the next instruction should have been executed + * after removing the request from the queue and dispatching it. + * We execute instead this instruction before bfq_remove_request() + * (and hence introduce a temporary inconsistency), for efficiency. + * In fact, in a forced_dispatch, this prevents two counters related + * to bfqq->dispatched to risk to be uselessly decremented if bfqq + * is not in service, and then to be incremented again after + * incrementing bfqq->dispatched. + */ + bfqq->dispatched++; + bfq_remove_request(rq); + elv_dispatch_sort(q, rq); + + if (bfq_bfqq_sync(bfqq)) + bfqd->sync_flight++; +} + +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq) +{ + struct request *rq = NULL; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + if (list_empty(&bfqq->fifo)) + return NULL; + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (time_before(jiffies, rq->fifo_time)) + return NULL; + + return rq; +} + +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + return entity->budget - entity->service; +} + +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfqq != bfqd->in_service_queue); + + __bfq_bfqd_reset_in_service(bfqd); + + /* + * If this bfqq is shared between multiple processes, check + * to make sure that those processes are still issuing I/Os + * within the mean seek distance. If not, it may be time to + * break the queues apart again. + */ + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) + bfq_mark_bfqq_split_coop(bfqq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * Overloading budget_timeout field to store the time + * at which the queue remains with no backlog; used by + * the weight-raising mechanism. + */ + bfqq->budget_timeout = jiffies; + bfq_del_bfqq_busy(bfqd, bfqq, 1); + } else { + bfq_activate_bfqq(bfqd, bfqq); + /* + * Resort priority tree of potential close cooperators. + */ + bfq_rq_pos_tree_add(bfqd, bfqq); + } +} + +/** + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. + * @bfqd: device data. + * @bfqq: queue to update. + * @reason: reason for expiration. + * + * Handle the feedback on @bfqq budget. See the body for detailed + * comments. 
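+ * In short, for sync queues the budget is: reduced only on
+ * BFQ_BFQQ_TOO_IDLE with no request still in flight; doubled on a
+ * budget timeout, or on BFQ_BFQQ_TOO_IDLE with requests still in
+ * flight; quadrupled on budget exhaustion; left unchanged otherwise.
+ * All increases are capped at the maximum budget, and async queues
+ * always get the maximum budget.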
+ */ +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + struct request *next_rq; + unsigned long budget, min_budget; + + budget = bfqq->max_budget; + min_budget = bfq_min_budget(bfqd); + + BUG_ON(bfqq != bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq)) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency + * for throughput. + */ + case BFQ_BFQQ_TOO_IDLE: + /* + * This is the only case where we may reduce + * the budget: if there is no request of the + * process still waiting for completion, then + * we assume (tentatively) that the timer has + * expired because the batch of requests of + * the process could have been served with a + * smaller budget. Hence, betting that + * process will behave in the same way when it + * becomes backlogged again, we reduce its + * next budget. As long as we guess right, + * this budget cut reduces the latency + * experienced by the process. + * + * However, if there are still outstanding + * requests, then the process may have not yet + * issued its next request just because it is + * still waiting for the completion of some of + * the still outstanding ones. So in this + * subcase we do not reduce its budget, on the + * contrary we increase it to possibly boost + * the throughput, as discussed in the + * comments to the BUDGET_TIMEOUT case. + */ + if (bfqq->dispatched > 0) /* still outstanding reqs */ + budget = min(budget * 2, bfqd->bfq_max_budget); + else { + if (budget > 5 * min_budget) + budget -= 4 * min_budget; + else + budget = min_budget; + } + break; + case BFQ_BFQQ_BUDGET_TIMEOUT: + /* + * We double the budget here because: 1) it + * gives the chance to boost the throughput if + * this is not a seeky process (which may have + * bumped into this timeout because of, e.g., + * ZBR), 2) together with charge_full_budget + * it helps give seeky processes higher + * timestamps, and hence be served less + * frequently. + */ + budget = min(budget * 2, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_BUDGET_EXHAUSTED: + /* + * The process still has backlog, and did not + * let either the budget timeout or the disk + * idling timeout expire. Hence it is not + * seeky, has a short thinktime and may be + * happy with a higher budget too. So + * definitely increase the budget of this good + * candidate to boost the disk throughput. + */ + budget = min(budget * 4, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_NO_MORE_REQUESTS: + /* + * Leave the budget unchanged. + */ + default: + return; + } + } else /* async queue */ + /* async queues get always the maximum possible budget + * (their ability to dispatch is limited by + * @bfqd->bfq_max_budget_async_rq). + */ + budget = bfqd->bfq_max_budget; + + bfqq->max_budget = budget; + + if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && + bfqq->max_budget > bfqd->bfq_max_budget) + bfqq->max_budget = bfqd->bfq_max_budget; + + /* + * Make sure that we have enough budget for the next request. + * Since the finish time of the bfqq must be kept in sync with + * the budget, be sure to call __bfq_bfqq_expire() after the + * update. 
+ */ + next_rq = bfqq->next_rq; + if (next_rq != NULL) + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + else + bfqq->entity.budget = bfqq->max_budget; + + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", + next_rq != NULL ? blk_rq_sectors(next_rq) : 0, + bfqq->entity.budget); +} + +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) +{ + unsigned long max_budget; + + /* + * The max_budget calculated when autotuning is equal to the + * amount of sectors transfered in timeout_sync at the + * estimated peak rate. + */ + max_budget = (unsigned long)(peak_rate * 1000 * + timeout >> BFQ_RATE_SHIFT); + + return max_budget; +} + +/* + * In addition to updating the peak rate, checks whether the process + * is "slow", and returns 1 if so. This slow flag is used, in addition + * to the budget timeout, to reduce the amount of service provided to + * seeky processes, and hence reduce their chances to lower the + * throughput. See the code for more details. + */ +static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int compensate, enum bfqq_expiration reason) +{ + u64 bw, usecs, expected, timeout; + ktime_t delta; + int update = 0; + + if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) + return 0; + + if (compensate) + delta = bfqd->last_idling_start; + else + delta = ktime_get(); + delta = ktime_sub(delta, bfqd->last_budget_start); + usecs = ktime_to_us(delta); + + /* Don't trust short/unrealistic values. */ + if (usecs < 100 || usecs >= LONG_MAX) + return 0; + + /* + * Calculate the bandwidth for the last slice. We use a 64 bit + * value to store the peak rate, in sectors per usec in fixed + * point math. We do so to have enough precision in the estimate + * and to avoid overflows. + */ + bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; + do_div(bw, (unsigned long)usecs); + + timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + /* + * Use only long (> 20ms) intervals to filter out spikes for + * the peak rate estimation. 
+ */ + if (usecs > 20000) { + if (bw > bfqd->peak_rate || + (!BFQQ_SEEKY(bfqq) && + reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { + bfq_log(bfqd, "measured bw =%llu", bw); + /* + * To smooth oscillations use a low-pass filter with + * alpha=7/8, i.e., + * new_rate = (7/8) * old_rate + (1/8) * bw + */ + do_div(bw, 8); + if (bw == 0) + return 0; + bfqd->peak_rate *= 7; + do_div(bfqd->peak_rate, 8); + bfqd->peak_rate += bw; + update = 1; + bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); + } + + update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; + + if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) + bfqd->peak_rate_samples++; + + if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && + update) { + int dev_type = blk_queue_nonrot(bfqd->queue); + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd->peak_rate, + timeout); + bfq_log(bfqd, "new max_budget=%lu", + bfqd->bfq_max_budget); + } + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } + } + } + + /* + * If the process has been served for a too short time + * interval to let its possible sequential accesses prevail on + * the initial seek time needed to move the disk head on the + * first sector it requested, then give the process a chance + * and for the moment return false. + */ + if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) + return 0; + + /* + * A process is considered ``slow'' (i.e., seeky, so that we + * cannot treat it fairly in the service domain, as it would + * slow down too much the other processes) if, when a slice + * ends for whatever reason, it has received service at a + * rate that would not be high enough to complete the budget + * before the budget timeout expiration. + */ + expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; + + /* + * Caveat: processes doing IO in the slower disk zones will + * tend to be slow(er) even if not seeky. And the estimated + * peak rate will actually be an average over the disk + * surface. Hence, to not be too harsh with unlucky processes, + * we keep a budget/3 margin of safety before declaring a + * process slow. + */ + return expected > (4 * bfqq->entity.budget) / 3; +} + +/* + * To be deemed as soft real-time, an application must meet two + * requirements. First, the application must not require an average + * bandwidth higher than the approximate bandwidth required to playback or + * record a compressed high-definition video. + * The next function is invoked on the completion of the last request of a + * batch, to compute the next-start time instant, soft_rt_next_start, such + * that, if the next request of the application does not arrive before + * soft_rt_next_start, then the above requirement on the bandwidth is met. + * + * The second requirement is that the request pattern of the application is + * isochronous, i.e., that, after issuing a request or a batch of requests, + * the application stops issuing new requests until all its pending requests + * have been completed. After that, the application may issue a new batch, + * and so on. 
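+ * (Audio and video players typically behave this way: they issue a
+ * batch of requests for the next frames or samples, then remain idle
+ * until the following period.)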
+ * For this reason the next function is invoked to compute + * soft_rt_next_start only for applications that meet this requirement, + * whereas soft_rt_next_start is set to infinity for applications that do + * not. + * + * Unfortunately, even a greedy application may happen to behave in an + * isochronous way if the CPU load is high. In fact, the application may + * stop issuing requests while the CPUs are busy serving other processes, + * then restart, then stop again for a while, and so on. In addition, if + * the disk achieves a low enough throughput with the request pattern + * issued by the application (e.g., because the request pattern is random + * and/or the device is slow), then the application may meet the above + * bandwidth requirement too. To prevent such a greedy application to be + * deemed as soft real-time, a further rule is used in the computation of + * soft_rt_next_start: soft_rt_next_start must be higher than the current + * time plus the maximum time for which the arrival of a request is waited + * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. + * This filters out greedy applications, as the latter issue instead their + * next request as soon as possible after the last one has been completed + * (in contrast, when a batch of requests is completed, a soft real-time + * application spends some time processing data). + * + * Unfortunately, the last filter may easily generate false positives if + * only bfqd->bfq_slice_idle is used as a reference time interval and one + * or both the following cases occur: + * 1) HZ is so low that the duration of a jiffy is comparable to or higher + * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with + * HZ=100. + * 2) jiffies, instead of increasing at a constant rate, may stop increasing + * for a while, then suddenly 'jump' by several units to recover the lost + * increments. This seems to happen, e.g., inside virtual machines. + * To address this issue, we do not use as a reference time interval just + * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In + * particular we add the minimum number of jiffies for which the filter + * seems to be quite precise also in embedded systems and KVM/QEMU virtual + * machines. + */ +static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return max(bfqq->last_idle_bklogged + + HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate, + jiffies + bfqq->bfqd->bfq_slice_idle + 4); +} + +/* + * Return the largest-possible time instant such that, for as long as possible, + * the current time will be lower than this time instant according to the macro + * time_is_before_jiffies(). + */ +static inline unsigned long bfq_infinity_from_now(unsigned long now) +{ + return now + ULONG_MAX / 2; +} + +/** + * bfq_bfqq_expire - expire a queue. + * @bfqd: device owning the queue. + * @bfqq: the queue to expire. + * @compensate: if true, compensate for the time spent idling. + * @reason: the reason causing the expiration. + * + * + * If the process associated to the queue is slow (i.e., seeky), or in + * case of budget timeout, or, finally, if it is async, we + * artificially charge it an entire budget (independently of the + * actual service it received). As a consequence, the queue will get + * higher timestamps than the correct ones upon reactivation, and + * hence it will be rescheduled as if it had received more service + * than what it actually received. 
In the end, this class of processes + * will receive less service in proportion to how slowly they consume + * their budgets (and hence how seriously they tend to lower the + * throughput). + * + * In contrast, when a queue expires because it has been idling for + * too much or because it exhausted its budget, we do not touch the + * amount of service it has received. Hence when the queue will be + * reactivated and its timestamps updated, the latter will be in sync + * with the actual service received by the queue until expiration. + * + * Charging a full budget to the first type of queues and the exact + * service to the others has the effect of using the WF2Q+ policy to + * schedule the former on a timeslice basis, without violating the + * service domain guarantees of the latter. + */ +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int compensate, + enum bfqq_expiration reason) +{ + int slow; + BUG_ON(bfqq != bfqd->in_service_queue); + + /* Update disk peak rate for autotuning and check whether the + * process is slow (see bfq_update_peak_rate). + */ + slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); + + /* + * As above explained, 'punish' slow (i.e., seeky), timed-out + * and async queues, to favor sequential sync workloads. + * + * Processes doing I/O in the slower disk zones will tend to be + * slow(er) even if not seeky. Hence, since the estimated peak + * rate is actually an average over the disk surface, these + * processes may timeout just for bad luck. To avoid punishing + * them we do not charge a full budget to a process that + * succeeded in consuming at least 2/3 of its budget. + */ + if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) + bfq_bfqq_charge_full_budget(bfqq); + + bfqq->service_from_backlogged += bfqq->entity.service; + + if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && + !bfq_bfqq_constantly_seeky(bfqq)) { + bfq_mark_bfqq_constantly_seeky(bfqq); + if (!blk_queue_nonrot(bfqd->queue)) + bfqd->const_seeky_busy_in_flight_queues++; + } + + if (reason == BFQ_BFQQ_TOO_IDLE && + bfqq->entity.service <= 2 * bfqq->entity.budget / 10 ) + bfq_clear_bfqq_IO_bound(bfqq); + + if (bfqd->low_latency && bfqq->wr_coeff == 1) + bfqq->last_wr_start_finish = jiffies; + + if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * If we get here, and there are no outstanding requests, + * then the request pattern is isochronous (see the comments + * to the function bfq_bfqq_softrt_next_start()). Hence we + * can compute soft_rt_next_start. If, instead, the queue + * still has outstanding requests, then we have to wait + * for the completion of all the outstanding requests to + * discover whether the request pattern is actually + * isochronous. + */ + if (bfqq->dispatched == 0) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + else { + /* + * The application is still waiting for the + * completion of one or more requests: + * prevent it from possibly being incorrectly + * deemed as soft real-time by setting its + * soft_rt_next_start to infinity. In fact, + * without this assignment, the application + * would be incorrectly deemed as soft + * real-time if: + * 1) it issued a new request before the + * completion of all its in-flight + * requests, and + * 2) at that time, its soft_rt_next_start + * happened to be in the past. 
+ */ + bfqq->soft_rt_next_start = + bfq_infinity_from_now(jiffies); + /* + * Schedule an update of soft_rt_next_start to when + * the task may be discovered to be isochronous. + */ + bfq_mark_bfqq_softrt_update(bfqq); + } + } + + bfq_log_bfqq(bfqd, bfqq, + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, + slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + + /* + * Increase, decrease or leave budget unchanged according to + * reason. + */ + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + __bfq_bfqq_expire(bfqd, bfqq); +} + +/* + * Budget timeout is not implemented through a dedicated timer, but + * just checked on request arrivals and completions, as well as on + * idle timer expirations. + */ +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_budget_new(bfqq) || + time_before(jiffies, bfqq->budget_timeout)) + return 0; + return 1; +} + +/* + * If we expire a queue that is waiting for the arrival of a new + * request, we may prevent the fictitious timestamp back-shifting that + * allows the guarantees of the queue to be preserved (see [1] for + * this tricky aspect). Hence we return true only if this condition + * does not hold, or if the queue is slow enough to deserve only to be + * kicked off for preserving a high throughput. +*/ +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, + "may_budget_timeout: wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); + + return (!bfq_bfqq_wait_request(bfqq) || + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) + && + bfq_bfqq_budget_timeout(bfqq); +} + +/* + * Device idling is allowed only for the queues for which this function + * returns true. For this reason, the return value of this function plays a + * critical role for both throughput boosting and service guarantees. The + * return value is computed through a logical expression. In this rather + * long comment, we try to briefly describe all the details and motivations + * behind the components of this logical expression. + * + * First, the expression is false if bfqq is not sync, or if: bfqq happened + * to become active during a large burst of queue activations, and the + * pattern of requests bfqq contains boosts the throughput if bfqq is + * expired. In fact, queues that became active during a large burst benefit + * only from throughput, as discussed in the comments to bfq_handle_burst. + * In this respect, expiring bfqq certainly boosts the throughput on NCQ- + * capable flash-based devices, whereas, on rotational devices, it boosts + * the throughput only if bfqq contains random requests. + * + * On the opposite end, if (a) bfqq is sync, (b) the above burst-related + * condition does not hold, and (c) bfqq is being weight-raised, then the + * expression always evaluates to true, as device idling is instrumental + * for preserving low-latency guarantees (see [1]). If, instead, conditions + * (a) and (b) do hold, but (c) does not, then the expression evaluates to + * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and + * (2) at least one of the following two conditions holds. + * The first condition is that the device is not performing NCQ, because + * idling the device most certainly boosts the throughput if this condition + * holds and bfqq is I/O-bound and has been granted a non-null idle window. 
+ * The second compound condition is made of the logical AND of two components. + * + * The first component is true only if there is no weight-raised busy + * queue. This guarantees that the device is not idled for a sync non- + * weight-raised queue when there are busy weight-raised queues. The former + * is then expired immediately if empty. Combined with the timestamping + * rules of BFQ (see [1] for details), this causes sync non-weight-raised + * queues to get a lower number of requests served, and hence to ask for a + * lower number of requests from the request pool, before the busy weight- + * raised queues get served again. + * + * This is beneficial for the processes associated with weight-raised + * queues, when the request pool is saturated (e.g., in the presence of + * write hogs). In fact, if the processes associated with the other queues + * ask for requests at a lower rate, then weight-raised processes have a + * higher probability to get a request from the pool immediately (or at + * least soon) when they need one. Hence they have a higher probability to + * actually get a fraction of the disk throughput proportional to their + * high weight. This is especially true with NCQ-capable drives, which + * enqueue several requests in advance and further reorder internally- + * queued requests. + * + * In the end, mistreating non-weight-raised queues when there are busy + * weight-raised queues seems to mitigate starvation problems in the + * presence of heavy write workloads and NCQ, and hence to guarantee a + * higher application and system responsiveness in these hostile scenarios. + * + * If the first component of the compound condition is instead true, i.e., + * there is no weight-raised busy queue, then the second component of the + * compound condition takes into account service-guarantee and throughput + * issues related to NCQ (recall that the compound condition is evaluated + * only if the device is detected as supporting NCQ). + * + * As for service guarantees, allowing the drive to enqueue more than one + * request at a time, and hence delegating de facto final scheduling + * decisions to the drive's internal scheduler, causes loss of control on + * the actual request service order. In this respect, when the drive is + * allowed to enqueue more than one request at a time, the service + * distribution enforced by the drive's internal scheduler is likely to + * coincide with the desired device-throughput distribution only in the + * following, perfectly symmetric, scenario: + * 1) all active queues have the same weight, + * 2) all active groups at the same level in the groups tree have the same + * weight, + * 3) all active groups at the same level in the groups tree have the same + * number of children. + * + * Even in such a scenario, sequential I/O may still receive a preferential + * treatment, but this is not likely to be a big issue with flash-based + * devices, because of their non-dramatic loss of throughput with random + * I/O. Things do differ with HDDs, for which additional care is taken, as + * explained after completing the discussion for flash-based devices. + * + * Unfortunately, keeping the necessary state for evaluating exactly the + * above symmetry conditions would be quite complex and time-consuming. 
+ * Therefore BFQ evaluates instead the following stronger sub-conditions, + * for which it is much easier to maintain the needed state: + * 1) all active queues have the same weight, + * 2) all active groups have the same weight, + * 3) all active groups have at most one active child each. + * In particular, the last two conditions are always true if hierarchical + * support and the cgroups interface are not enabled, hence no state needs + * to be maintained in this case. + * + * According to the above considerations, the second component of the + * compound condition evaluates to true if any of the above symmetry + * sub-condition does not hold, or the device is not flash-based. Therefore, + * if also the first component is true, then idling is allowed for a sync + * queue. These are the only sub-conditions considered if the device is + * flash-based, as, for such a device, it is sensible to force idling only + * for service-guarantee issues. In fact, as for throughput, idling + * NCQ-capable flash-based devices would not boost the throughput even + * with sequential I/O; rather it would lower the throughput in proportion + * to how fast the device is. In the end, (only) if all the three + * sub-conditions hold and the device is flash-based, the compound + * condition evaluates to false and therefore no idling is performed. + * + * As already said, things change with a rotational device, where idling + * boosts the throughput with sequential I/O (even with NCQ). Hence, for + * such a device the second component of the compound condition evaluates + * to true also if the following additional sub-condition does not hold: + * the queue is constantly seeky. Unfortunately, this different behavior + * with respect to flash-based devices causes an additional asymmetry: if + * some sync queues enjoy idling and some other sync queues do not, then + * the latter get a low share of the device throughput, simply because the + * former get many requests served after being set as in service, whereas + * the latter do not. As a consequence, to guarantee the desired throughput + * distribution, on HDDs the compound expression evaluates to true (and + * hence device idling is performed) also if the following last symmetry + * condition does not hold: no other queue is benefiting from idling. Also + * this last condition is actually replaced with a simpler-to-maintain and + * stronger condition: there is no busy queue which is not constantly seeky + * (and hence may also benefit from idling). + * + * To sum up, when all the required symmetry and throughput-boosting + * sub-conditions hold, the second component of the compound condition + * evaluates to false, and hence no idling is performed. This helps to + * keep the drives' internal queues full on NCQ-capable devices, and hence + * to boost the throughput, without causing 'almost' any loss of service + * guarantees. The 'almost' follows from the fact that, if the internal + * queue of one such device is filled while all the sub-conditions hold, + * but at some point in time some sub-condition stops to hold, then it may + * become impossible to let requests be served in the new desired order + * until all the requests already queued in the device have been served. 
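+ *
+ * As a compact restatement of all of the above (an illustrative
+ * sketch, not the literal expression below; some helper names are
+ * paraphrased), the final decision reads:
+ *
+ *   must_not_expire(bfqq) =
+ *       is_sync(bfqq) &&
+ *       !(in_large_burst(bfqq) && expiring_bfqq_boosts_throughput) &&
+ *       (is_weight_raised(bfqq) ||
+ *        !symmetric_scenario ||
+ *        (is_IO_bound(bfqq) && has_idle_window(bfqq) &&
+ *         !cond_for_expiring_non_wr));
+ *
+ * with the exact definition of each term given by the macros and
+ * the return statement below.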
+ */ +static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; +#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \ + bfqd->busy_in_flight_queues == \ + bfqd->const_seeky_busy_in_flight_queues) + +#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \ + bfqd->hw_tag && \ + (blk_queue_nonrot(bfqd->queue) || \ + bfq_bfqq_constantly_seeky(bfqq))) + +/* + * Condition for expiring a non-weight-raised queue (and hence not idling + * the device). + */ +#define cond_for_expiring_non_wr (bfqd->hw_tag && \ + (bfqd->wr_busy_queues > 0 || \ + (blk_queue_nonrot(bfqd->queue) || \ + cond_for_seeky_on_ncq_hdd))) + + return bfq_bfqq_sync(bfqq) && + !cond_for_expiring_in_burst && + (bfqq->wr_coeff > 1 || !symmetric_scenario || + (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) && + !cond_for_expiring_non_wr) + ); +} + +/* + * If the in-service queue is empty but sync, and the function + * bfq_bfqq_must_not_expire returns true, then: + * 1) the queue must remain in service and cannot be expired, and + * 2) the disk must be idled to wait for the possible arrival of a new + * request for the queue. + * See the comments to the function bfq_bfqq_must_not_expire for the reasons + * why performing device idling is the best choice to boost the throughput + * and preserve service guarantees when bfq_bfqq_must_not_expire itself + * returns true. + */ +static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && + bfq_bfqq_must_not_expire(bfqq); +} + +/* + * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. + */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + struct request *next_rq; + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; + + bfqq = bfqd->in_service_queue; + if (bfqq == NULL) + goto new_queue; + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + + if (bfq_may_expire_for_budg_timeout(bfqq) && + !timer_pending(&bfqd->idle_slice_timer) && + !bfq_bfqq_must_idle(bfqq)) + goto expire; + + next_rq = bfqq->next_rq; + /* + * If bfqq has requests queued and it has enough budget left to + * serve them, keep the queue, otherwise expire it. + */ + if (next_rq != NULL) { + if (bfq_serv_to_charge(next_rq, bfqq) > + bfq_bfqq_budget_left(bfqq)) { + reason = BFQ_BFQQ_BUDGET_EXHAUSTED; + goto expire; + } else { + /* + * The idle timer may be pending because we may + * not disable disk idling even when a new request + * arrives. + */ + if (timer_pending(&bfqd->idle_slice_timer)) { + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, + * 2) then the block layer has unplugged + * the device, causing the dispatch to be + * invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to + * provide a reasonable throughput. + * So we disable idling. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + } + goto keep_queue; + } + } + + /* + * No requests pending. However, if the in-service queue is idling + * for a new request, or has requests waiting for a completion and + * may idle after their completion, then keep it anyway. 
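+	 *
+	 * In pseudo-code (a simplified sketch of the check below, not
+	 * the literal statements):
+	 *
+	 *   if (idle timer pending ||
+	 *       (bfqq->dispatched > 0 && bfq_bfqq_must_not_expire(bfqq)))
+	 *           keep bfqq in service, but hand out no request for now;
+	 *   else
+	 *           expire bfqq for lack of requests;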
+ */ + if (timer_pending(&bfqd->idle_slice_timer) || + (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { + bfqq = NULL; + goto keep_queue; + } + + reason = BFQ_BFQQ_NO_MORE_REQUESTS; +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, reason); +new_queue: + bfqq = bfq_set_in_service_queue(bfqd); + bfq_log(bfqd, "select_queue: new queue %d returned", + bfqq != NULL ? bfqq->pid : 0); +keep_queue: + return bfqq; +} + +static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != + entity->orig_weight * bfqq->wr_coeff); + if (entity->ioprio_changed) + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); + + /* + * If the queue was activated in a burst, or + * too much time has elapsed from the beginning + * of this weight-raising period, or the queue has + * exceeded the acceptable number of cooperations, + * then end weight raising. + */ + if (bfq_bfqq_in_large_burst(bfqq) || + bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + bfqq->last_wr_start_finish = jiffies; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + bfq_bfqq_end_wr(bfqq); + } + } + /* Update weight both if it must be raised and if it must be lowered */ + if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) + __bfq_entity_update_weight_prio( + bfq_entity_service_tree(entity), + entity); +} + +/* + * Dispatch one request from bfqq, moving it to the request queue + * dispatch list. + */ +static int bfq_dispatch_request(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + int dispatched = 0; + struct request *rq; + unsigned long service_to_charge; + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Follow expired path, else get first next available. */ + rq = bfq_check_fifo(bfqq); + if (rq == NULL) + rq = bfqq->next_rq; + service_to_charge = bfq_serv_to_charge(rq, bfqq); + + if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { + /* + * This may happen if the next rq is chosen in fifo order + * instead of sector order. The budget is properly + * dimensioned to be always sufficient to serve the next + * request only if it is chosen in sector order. The reason + * is that it would be quite inefficient and little useful + * to always make sure that the budget is large enough to + * serve even the possible next rq in fifo order. + * In fact, requests are seldom served in fifo order. + * + * Expire the queue for budget exhaustion, and make sure + * that the next act_budget is enough to serve the next + * request, even if it comes from the fifo expired path. + */ + bfqq->next_rq = rq; + /* + * Since this dispatch is failed, make sure that + * a new one will be performed + */ + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + goto expire; + } + + /* Finally, insert request into driver dispatch list. 
*/ + bfq_bfqq_served(bfqq, service_to_charge); + bfq_dispatch_insert(bfqd->queue, rq); + + bfq_update_wr_data(bfqd, bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "dispatched %u sec req (%llu), budg left %lu", + blk_rq_sectors(rq), + (long long unsigned)blk_rq_pos(rq), + bfq_bfqq_budget_left(bfqq)); + + dispatched++; + + if (bfqd->in_service_bic == NULL) { + atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); + bfqd->in_service_bic = RQ_BIC(rq); + } + + if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && + dispatched >= bfqd->bfq_max_budget_async_rq) || + bfq_class_idle(bfqq))) + goto expire; + + return dispatched; + +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); + return dispatched; +} + +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) +{ + int dispatched = 0; + + while (bfqq->next_rq != NULL) { + bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); + dispatched++; + } + + BUG_ON(!list_empty(&bfqq->fifo)); + return dispatched; +} + +/* + * Drain our current requests. + * Used for barriers and when switching io schedulers on-the-fly. + */ +static int bfq_forced_dispatch(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *n; + struct bfq_service_tree *st; + int dispatched = 0; + + bfqq = bfqd->in_service_queue; + if (bfqq != NULL) + __bfq_bfqq_expire(bfqd, bfqq); + + /* + * Loop through classes, and be careful to leave the scheduler + * in a consistent state, as feedback mechanisms and vtime + * updates cannot be disabled during the process. + */ + list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { + st = bfq_entity_service_tree(&bfqq->entity); + + dispatched += __bfq_forced_dispatch_bfqq(bfqq); + bfqq->max_budget = bfq_max_budget(bfqd); + + bfq_forget_idle(st); + } + + BUG_ON(bfqd->busy_queues != 0); + + return dispatched; +} + +static int bfq_dispatch_requests(struct request_queue *q, int force) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + int max_dispatch; + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + if (bfqd->busy_queues == 0) + return 0; + + if (unlikely(force)) + return bfq_forced_dispatch(bfqd); + + bfqq = bfq_select_queue(bfqd); + if (bfqq == NULL) + return 0; + + if (bfq_class_idle(bfqq)) + max_dispatch = 1; + + if (!bfq_bfqq_sync(bfqq)) + max_dispatch = bfqd->bfq_max_budget_async_rq; + + if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { + if (bfqd->busy_queues > 1) + return 0; + if (bfqq->dispatched >= 4 * max_dispatch) + return 0; + } + + if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) + return 0; + + bfq_clear_bfqq_wait_request(bfqq); + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + if (!bfq_dispatch_request(bfqd, bfqq)) + return 0; + + bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", + bfq_bfqq_sync(bfqq) ? "sync" : "async"); + + return 1; +} + +/* + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Queue lock must be held here. 
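+ *
+ * A rough map of where those references come from in this file (a
+ * sketch, not an exhaustive list):
+ *
+ *   bfq_get_queue()     atomic_inc(&bfqq->ref)  per bic / async slot
+ *   bfq_set_request()   atomic_inc(&bfqq->ref)  per allocated rq
+ *   bfq_put_request()   bfq_put_queue(bfqq)     when the rq is freed
+ *   bfq_exit_bfqq()     bfq_put_queue(bfqq)     when the icq goes away
+ *
+ * The queue is returned to bfq_pool only when the last of these
+ * references is dropped, in the function below.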
+ */ +static void bfq_put_queue(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + BUG_ON(atomic_read(&bfqq->ref) <= 0); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, + atomic_read(&bfqq->ref)); + if (!atomic_dec_and_test(&bfqq->ref)) + return; + + BUG_ON(rb_first(&bfqq->sort_list) != NULL); + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree != NULL); + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqd->in_service_queue == bfqq); + + if (bfq_bfqq_sync(bfqq)) + /* + * The fact that this queue is being destroyed does not + * invalidate the fact that this queue may have been + * activated during the current burst. As a consequence, + * although the queue does not exist anymore, and hence + * needs to be removed from the burst list if there, + * the burst size has not to be decremented. + */ + hlist_del_init(&bfqq->burst_list_node); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); +} + +static void bfq_put_cooperator(struct bfq_queue *bfqq) +{ + struct bfq_queue *__bfqq, *next; + + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { + if (__bfqq == bfqq) + break; + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; + } +} + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq == bfqd->in_service_queue) { + __bfq_bfqq_expire(bfqd, bfqq); + bfq_schedule_dispatch(bfqd); + } + + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); +} + +static inline void bfq_init_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + + bic->ttime.last_end_request = jiffies; + /* + * A newly created bic indicates that the process has just + * started doing I/O, and is probably mapping into memory its + * executable and libraries: it definitely needs weight raising. + * There is however the possibility that the process performs, + * for a while, I/O close to some other process. EQM intercepts + * this behavior and may merge the queue corresponding to the + * process with some other queue, BEFORE the weight of the queue + * is raised. Merged queues are not weight-raised (they are assumed + * to belong to processes that benefit only from high throughput). + * If the merge is basically the consequence of an accident, then + * the queue will be split soon and will get back its old weight. + * It is then important to write down somewhere that this queue + * does need weight raising, even if it did not make it to get its + * weight raised before being merged. To this purpose, we overload + * the field raising_time_left and assign 1 to it, to mark the queue + * as needing weight raising. + */ + bic->wr_time_left = 1; +} + +static void bfq_exit_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + + if (bic->bfqq[BLK_RW_ASYNC]) { + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); + bic->bfqq[BLK_RW_ASYNC] = NULL; + } + + if (bic->bfqq[BLK_RW_SYNC]) { + /* + * If the bic is using a shared queue, put the reference + * taken on the io_context when the bic started using a + * shared bfq_queue. 
+ */ + if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) + put_io_context(icq->ioc); + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); + bic->bfqq[BLK_RW_SYNC] = NULL; + } +} + +/* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + struct task_struct *tsk = current; + int ioprio_class; + + ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + switch (ioprio_class) { + default: + dev_err(bfqq->bfqd->queue->backing_dev_info.dev, + "bfq: bad prio class %d\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. + */ + bfqq->entity.new_ioprio = task_nice_ioprio(tsk); + bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); + break; + case IOPRIO_CLASS_RT: + bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->entity.new_ioprio = 7; + bfq_clear_bfqq_idle_window(bfqq); + break; + } + + if (bfqq->entity.new_ioprio < 0 || + bfqq->entity.new_ioprio >= IOPRIO_BE_NR) { + printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", + bfqq->entity.new_ioprio); + BUG(); + } + + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio); + bfqq->entity.ioprio_changed = 1; +} + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic) +{ + struct bfq_data *bfqd; + struct bfq_queue *bfqq, *new_bfqq; + struct bfq_group *bfqg; + unsigned long uninitialized_var(flags); + int ioprio = bic->icq.ioc->ioprio; + + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), + &flags); + /* + * This condition may trigger on a newly created bic, be sure to + * drop the lock before returning. + */ + if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) + goto out; + + bic->ioprio = ioprio; + + bfqq = bic->bfqq[BLK_RW_ASYNC]; + if (bfqq != NULL) { + bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, + sched_data); + new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic, + GFP_ATOMIC); + if (new_bfqq != NULL) { + bic->bfqq[BLK_RW_ASYNC] = new_bfqq; + bfq_log_bfqq(bfqd, bfqq, + "check_ioprio_change: bfqq %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } + } + + bfqq = bic->bfqq[BLK_RW_SYNC]; + if (bfqq != NULL) + bfq_set_next_ioprio_data(bfqq, bic); + +out: + bfq_put_bfqd_unlock(bfqd, &flags); +} + +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + INIT_HLIST_NODE(&bfqq->burst_list_node); + + atomic_set(&bfqq->ref, 0); + bfqq->bfqd = bfqd; + + if (bic) + bfq_set_next_ioprio_data(bfqq, bic); + + if (is_sync) { + if (!bfq_class_idle(bfqq)) + bfq_mark_bfqq_idle_window(bfqq); + bfq_mark_bfqq_sync(bfqq); + } + bfq_mark_bfqq_IO_bound(bfqq); + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; + bfqq->pid = pid; + + bfqq->wr_coeff = 1; + bfqq->last_wr_start_finish = 0; + /* + * Set to the value for which bfqq will not be deemed as + * soft rt when it becomes backlogged. 
+ */ + bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); +} + +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int is_sync, + struct bfq_io_cq *bic, + gfp_t gfp_mask) +{ + struct bfq_queue *bfqq, *new_bfqq = NULL; + +retry: + /* bic always exists here */ + bfqq = bic_to_bfqq(bic, is_sync); + + /* + * Always try a new alloc if we fall back to the OOM bfqq + * originally, since it should just be a temporary situation. + */ + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = NULL; + if (new_bfqq != NULL) { + bfqq = new_bfqq; + new_bfqq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(bfqd->queue->queue_lock); + new_bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + spin_lock_irq(bfqd->queue->queue_lock); + if (new_bfqq != NULL) + goto retry; + } else { + bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + } + + if (bfqq != NULL) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + } + } + + if (new_bfqq != NULL) + kmem_cache_free(bfq_pool, new_bfqq); + + return bfqq; +} + +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int ioprio_class, int ioprio) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + return &bfqg->async_bfqq[0][ioprio]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_NORM; + /* fall through */ + case IOPRIO_CLASS_BE: + return &bfqg->async_bfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: + return &bfqg->async_idle_bfqq; + default: + BUG(); + } +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct bfq_io_cq *bic, gfp_t gfp_mask) +{ + const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + struct bfq_queue **async_bfqq = NULL; + struct bfq_queue *bfqq = NULL; + + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, + ioprio); + bfqq = *async_bfqq; + } + + if (bfqq == NULL) + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); + + /* + * Pin the queue now that it's allocated, scheduler exit will + * prune it. 
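+	 *
+	 * For orientation, a sketch of the per-group cache that
+	 * *async_bfqq points into (see bfq_async_queue_prio() above;
+	 * simplified):
+	 *
+	 *   async_bfqq[0][0..IOPRIO_BE_NR-1]  one slot per RT priority level
+	 *   async_bfqq[1][0..IOPRIO_BE_NR-1]  one slot per BE priority level
+	 *                                     (CLASS_NONE maps to IOPRIO_NORM)
+	 *   async_idle_bfqq                   a single slot for the IDLE class
+	 *
+	 * At most one async queue exists per slot, and the extra
+	 * reference taken here is what keeps that slot valid until the
+	 * scheduler or the group is torn down.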
+ */ + if (!is_sync && *async_bfqq == NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", + bfqq, atomic_read(&bfqq->ref)); + *async_bfqq = bfqq; + } + + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + return bfqq; +} + +static void bfq_update_io_thinktime(struct bfq_data *bfqd, + struct bfq_io_cq *bic) +{ + unsigned long elapsed = jiffies - bic->ttime.last_end_request; + unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); + + bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; + bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / + bic->ttime.ttime_samples; +} + +static void bfq_update_io_seektime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *rq) +{ + sector_t sdist; + u64 total; + + if (bfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - bfqq->last_request_pos; + else + sdist = bfqq->last_request_pos - blk_rq_pos(rq); + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, pagein, etc. + */ + if (bfqq->seek_samples == 0) /* first request, not really a seek */ + sdist = 0; + else if (bfqq->seek_samples <= 60) /* second & third seek */ + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); + else + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); + + bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; + bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; + total = bfqq->seek_total + (bfqq->seek_samples/2); + do_div(total, bfqq->seek_samples); + bfqq->seek_mean = (sector_t)total; + + bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, + (u64)bfqq->seek_mean); +} + +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter. + */ +static void bfq_update_idle_window(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + int enable_idle; + + /* Don't idle for async or idle io prio class. */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + return; + + /* Idle window just restored, statistics are meaningless. */ + if (bfq_bfqq_just_split(bfqq)) + return; + + enable_idle = bfq_bfqq_idle_window(bfqq); + + if (atomic_read(&bic->icq.ioc->active_ref) == 0 || + bfqd->bfq_slice_idle == 0 || + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && + bfqq->wr_coeff == 1)) + enable_idle = 0; + else if (bfq_sample_valid(bic->ttime.ttime_samples)) { + if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && + bfqq->wr_coeff == 1) + enable_idle = 0; + else + enable_idle = 1; + } + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", + enable_idle); + + if (enable_idle) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); +} + +/* + * Called when a new fs request (rq) is added to bfqq. Check if there's + * something we should do about it. 
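+ *
+ * The heuristics invoked from here rely on the decaying averages
+ * kept by bfq_update_io_thinktime() and bfq_update_io_seektime()
+ * above. As a quick worked example (numbers invented): with a
+ * previous ttime_total of 800 and a new think-time sample of 160
+ * jiffies, the update is
+ *
+ *   ttime_total = (7 * 800 + 256 * 160) / 8 = 5820
+ *   ttime_mean  = (ttime_total + 128) / ttime_samples
+ *
+ * so each new sample contributes roughly 1/8 of its 256-scaled
+ * value, and it is this mean that bfq_update_idle_window() then
+ * compares against bfqd->bfq_slice_idle.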
+ */ +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + struct bfq_io_cq *bic = RQ_BIC(rq); + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + + bfq_update_io_thinktime(bfqd, bic); + bfq_update_io_seektime(bfqd, bfqq, rq); + if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { + bfq_clear_bfqq_constantly_seeky(bfqq); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, bic); + bfq_clear_bfqq_just_split(bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), + (long long unsigned)bfqq->seek_mean); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { + int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + int budget_timeout = bfq_bfqq_budget_timeout(bfqq); + + /* + * There is just this request queued: if the request + * is small and the queue is not to be expired, then + * just exit. + * + * In this way, if the disk is being idled to wait for + * a new request from the in-service queue, we avoid + * unplugging the device and committing the disk to serve + * just a small request. On the contrary, we wait for + * the block layer to decide when to unplug the device: + * hopefully, new requests will be merged to this one + * quickly, then the device will be unplugged and + * larger requests will be dispatched. + */ + if (small_req && !budget_timeout) + return; + + /* + * A large enough request arrived, or the queue is to + * be expired: in both cases disk idling is to be + * stopped, so clear wait_request flag and reset + * timer. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + + /* + * The queue is not empty, because a new request just + * arrived. Hence we can safely expire the queue, in + * case of budget timeout, without risking that the + * timestamps of the queue are not updated correctly. + * See [1] for more details. + */ + if (budget_timeout) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + + /* + * Let the request rip immediately, or let a new queue be + * selected if bfqq has just been expired. + */ + __blk_run_queue(bfqd->queue); + } +} + +static void bfq_insert_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + + assert_spin_locked(bfqd->queue->queue_lock); + + /* + * An unplug may trigger a requeue of a request from the device + * driver: make sure we are in process context while trying to + * merge two bfq_queues. + */ + if (!in_interrupt()) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + if (new_bfqq != NULL) { + if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) + new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); + /* + * Release the request's reference to the old bfqq + * and make sure one is taken to the shared queue. 
+ */ + new_bfqq->allocated[rq_data_dir(rq)]++; + bfqq->allocated[rq_data_dir(rq)]--; + atomic_inc(&new_bfqq->ref); + bfq_put_queue(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + rq->elv.priv[1] = new_bfqq; + bfqq = new_bfqq; + } else + bfq_bfqq_increase_failed_cooperations(bfqq); + } + + bfq_add_request(rq); + + /* + * Here a newly-created bfq_queue has already started a weight-raising + * period: clear raising_time_left to prevent bfq_bfqq_save_state() + * from assigning it a full weight-raising period. See the detailed + * comments about this field in bfq_init_icq(). + */ + if (bfqq->bic != NULL) + bfqq->bic->wr_time_left = 0; + rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); +} + +static void bfq_update_hw_tag(struct bfq_data *bfqd) +{ + bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, + bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; + + /* + * This sample is valid if the number of outstanding requests + * is large enough to allow a queueing behavior. Note that the + * sum is not exact, as it's not taking into account deactivated + * requests. + */ + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) + return; + + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; + bfqd->max_rq_in_driver = 0; + bfqd->hw_tag_samples = 0; +} + +static void bfq_completed_request(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + bool sync = bfq_bfqq_sync(bfqq); + + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", + blk_rq_sectors(rq), sync); + + bfq_update_hw_tag(bfqd); + + BUG_ON(!bfqd->rq_in_driver); + BUG_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->busy_in_flight_queues); + bfqd->busy_in_flight_queues--; + if (bfq_bfqq_constantly_seeky(bfqq)) { + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + } + + if (sync) { + bfqd->sync_flight--; + RQ_BIC(rq)->ttime.last_end_request = jiffies; + } + + /* + * If we are waiting to discover whether the request pattern of the + * task associated with the queue is actually isochronous, and + * both requisites for this condition to hold are satisfied, then + * compute soft_rt_next_start (see the comments to the function + * bfq_bfqq_softrt_next_start()). + */ + if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + + /* + * If this is the in-service queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. 
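+	 *
+	 * Roughly (a simplified sketch of the branch below, not the
+	 * literal code):
+	 *
+	 *   if (bfq_bfqq_must_idle(bfqq))
+	 *           arm the idle timer and wait;
+	 *   else if (the budget timeout allows it)
+	 *           expire with reason BFQ_BFQQ_BUDGET_TIMEOUT;
+	 *   else if (no queued requests and idling is not worth it)
+	 *           expire with reason BFQ_BFQQ_NO_MORE_REQUESTS;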
+ */ + if (bfqd->in_service_queue == bfqq) { + if (bfq_bfqq_budget_new(bfqq)) + bfq_set_budget_timeout(bfqd); + + if (bfq_bfqq_must_idle(bfqq)) { + bfq_arm_slice_timer(bfqd); + goto out; + } else if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + else if (RB_EMPTY_ROOT(&bfqq->sort_list) && + (bfqq->dispatched == 0 || + !bfq_bfqq_must_not_expire(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, 0, + BFQ_BFQQ_NO_MORE_REQUESTS); + } + + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + +out: + return; +} + +static inline int __bfq_may_queue(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { + bfq_clear_bfqq_must_alloc(bfqq); + return ELV_MQUEUE_MUST; + } + + return ELV_MQUEUE_MAY; +} + +static int bfq_may_queue(struct request_queue *q, int rw) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + /* + * Don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be + * queued. So just lookup a possibly existing queue, or return + * 'may queue' if that fails. + */ + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (bic == NULL) + return ELV_MQUEUE_MAY; + + bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); + if (bfqq != NULL) + return __bfq_may_queue(bfqq); + + return ELV_MQUEUE_MAY; +} + +/* + * Queue lock held here. + */ +static void bfq_put_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + if (bfqq != NULL) { + const int rw = rq_data_dir(rq); + + BUG_ON(!bfqq->allocated[rw]); + bfqq->allocated[rw]--; + + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +/* + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this + * was the last process referring to said bfqq. + */ +static struct bfq_queue * +bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + + put_io_context(bic->icq.ioc); + + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; + } + + bic_set_bfqq(bic, NULL, 1); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); + return NULL; +} + +/* + * Allocate bfq data structures associated with this request. 
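+ *
+ * Rough call sketch for the hook below (simplified; locking and
+ * error paths omitted):
+ *
+ *   bfq_check_ioprio_change(bic);
+ *   bfqq = bic_to_bfqq(bic, is_sync);      or bfq_get_queue(...)
+ *   if bfqq is marked coop and split_coop:
+ *           bfqq = bfq_split_bfqq(bic, bfqq);
+ *   bfqq->allocated[rw]++;
+ *   atomic_inc(&bfqq->ref);
+ *   rq->elv.priv[0] = bic;  rq->elv.priv[1] = bfqq;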
+ */ +static int bfq_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); + const int rw = rq_data_dir(rq); + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + struct bfq_group *bfqg; + unsigned long flags; + bool split = false; + + might_sleep_if(gfp_mask & __GFP_WAIT); + + bfq_check_ioprio_change(bic); + + spin_lock_irqsave(q->queue_lock, flags); + + if (bic == NULL) + goto queue_fail; + + bfqg = bfq_bic_update_cgroup(bic); + +new_queue: + bfqq = bic_to_bfqq(bic, is_sync); + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { + if ((bic->was_in_burst_list && bfqd->large_burst) || + bic->saved_in_large_burst) + bfq_mark_bfqq_in_large_burst(bfqq); + else { + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) + hlist_add_head(&bfqq->burst_list_node, + &bfqd->burst_list); + } + } + } else { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + bfqq = bfq_split_bfqq(bic, bfqq); + split = true; + if (!bfqq) + goto new_queue; + } + } + + bfqq->allocated[rw]++; + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; + + /* + * If a bfq_queue has only one process reference, it is owned + * by only one bfq_io_cq: we can set the bic field of the + * bfq_queue to the address of that structure. Also, if the + * queue has just been split, mark a flag so that the + * information is available to the other scheduler hooks. + */ + if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; + if (split) { + bfq_mark_bfqq_just_split(bfqq); + /* + * If the queue has just been split from a shared + * queue, restore the idle window and the possible + * weight raising period. + */ + bfq_bfqq_resume_state(bfqq, bic); + } + } + + spin_unlock_irqrestore(q->queue_lock, flags); + + return 0; + +queue_fail: + bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(q->queue_lock, flags); + + return 1; +} + +static void bfq_kick_queue(struct work_struct *work) +{ + struct bfq_data *bfqd = + container_of(work, struct bfq_data, unplug_work); + struct request_queue *q = bfqd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +/* + * Handler of the expiration of the timer running if the in-service queue + * is idling inside its time slice. + */ +static void bfq_idle_slice_timer(unsigned long data) +{ + struct bfq_data *bfqd = (struct bfq_data *)data; + struct bfq_queue *bfqq; + unsigned long flags; + enum bfqq_expiration reason; + + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + + bfqq = bfqd->in_service_queue; + /* + * Theoretical race here: the in-service queue can be NULL or + * different from the queue that was idling if the timer handler + * spins on the queue_lock and a new request arrives for the + * current queue and there is a full dispatch cycle that changes + * the in-service queue. This can hardly happen, but in the worst + * case we just expire a queue too early. 
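+	 *
+	 * The handler below then simply picks an expiration reason
+	 * (sketch, not the literal code):
+	 *
+	 *   if (the budget timeout has expired)   reason = BUDGET_TIMEOUT
+	 *   else if (the queue is still empty)    reason = TOO_IDLE
+	 *   else                                  only schedule a dispatch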
+ */ + if (bfqq != NULL) { + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQ_BFQQ_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the + * first request of the in-service queue arrives + * during disk idling. + */ + reason = BFQ_BFQQ_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, 1, reason); + } + +schedule_dispatch: + bfq_schedule_dispatch(bfqd); + + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); +} + +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) +{ + del_timer_sync(&bfqd->idle_slice_timer); + cancel_work_sync(&bfqd->unplug_work); +} + +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) +{ + struct bfq_group *root_group = bfqd->root_group; + struct bfq_queue *bfqq = *bfqq_ptr; + + bfq_log(bfqd, "put_async_bfqq: %p", bfqq); + if (bfqq != NULL) { + bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; + } +} + +/* + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure until all the requests on a device are gone). + */ +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); +} + +static void bfq_exit_queue(struct elevator_queue *e) +{ + struct bfq_data *bfqd = e->elevator_data; + struct request_queue *q = bfqd->queue; + struct bfq_queue *bfqq, *n; + + bfq_shutdown_timer_wq(bfqd); + + spin_lock_irq(q->queue_lock); + + BUG_ON(bfqd->in_service_queue != NULL); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, 0); + + bfq_disconnect_groups(bfqd); + spin_unlock_irq(q->queue_lock); + + bfq_shutdown_timer_wq(bfqd); + + synchronize_rcu(); + + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + bfq_free_root_group(bfqd); + kfree(bfqd); +} + +static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) +{ + struct bfq_group *bfqg; + struct bfq_data *bfqd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (eq == NULL) + return -ENOMEM; + + bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); + if (bfqd == NULL) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = bfqd; + + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + atomic_inc(&bfqd->oom_bfqq.ref); + bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.entity.new_weight = + bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio); + /* + * Trigger weight initialization, according to ioprio, at the + * oom_bfqq's first activation. 
The oom_bfqq's ioprio and ioprio + * class won't be changed any more. + */ + bfqd->oom_bfqq.entity.ioprio_changed = 1; + + bfqd->queue = q; + + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); + + bfqg = bfq_alloc_root_group(bfqd, q->node); + if (bfqg == NULL) { + kfree(bfqd); + kobject_put(&eq->kobj); + return -ENOMEM; + } + + bfqd->root_group = bfqg; + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); +#ifdef CONFIG_CGROUP_BFQIO + bfqd->active_numerous_groups = 0; +#endif + + init_timer(&bfqd->idle_slice_timer); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + bfqd->idle_slice_timer.data = (unsigned long)bfqd; + + bfqd->rq_pos_tree = RB_ROOT; + bfqd->queue_weights_tree = RB_ROOT; + bfqd->group_weights_tree = RB_ROOT; + + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + INIT_HLIST_HEAD(&bfqd->burst_list); + + bfqd->hw_tag = -1; + + bfqd->bfq_max_budget = bfq_default_max_budget; + + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; + bfqd->bfq_back_max = bfq_back_max; + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_class_idle_last_service = 0; + bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + + bfqd->bfq_coop_thresh = 2; + bfqd->bfq_failed_cooperations = 7000; + bfqd->bfq_requests_within_timer = 120; + + bfqd->bfq_large_burst_thresh = 11; + bfqd->bfq_burst_interval = msecs_to_jiffies(500); + + bfqd->low_latency = true; + + bfqd->bfq_wr_coeff = 20; + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); + bfqd->bfq_wr_max_time = 0; + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_wr_max_softrt_rate = 7000; /* + * Approximate rate required + * to playback or record a + * high-definition compressed + * video. + */ + bfqd->wr_busy_queues = 0; + bfqd->busy_in_flight_queues = 0; + bfqd->const_seeky_busy_in_flight_queues = 0; + + /* + * Begin by assuming, optimistically, that the device peak rate is + * equal to the highest reference rate. + */ + bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * + T_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->device_speed = BFQ_BFQD_FAST; + + return 0; +} + +static void bfq_slab_kill(void) +{ + if (bfq_pool != NULL) + kmem_cache_destroy(bfq_pool); +} + +static int __init bfq_slab_setup(void) +{ + bfq_pool = KMEM_CACHE(bfq_queue, 0); + if (bfq_pool == NULL) + return -ENOMEM; + return 0; +} + +static ssize_t bfq_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t bfq_var_store(unsigned long *var, const char *page, + size_t count) +{ + unsigned long new_val; + int ret = kstrtoul(page, 10, &new_val); + + if (ret == 0) + *var = new_val; + + return count; +} + +static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) +{ + struct bfq_data *bfqd = e->elevator_data; + return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : + jiffies_to_msecs(bfq_wr_duration(bfqd))); +} + +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) +{ + struct bfq_queue *bfqq; + struct bfq_data *bfqd = e->elevator_data; + ssize_t num_char = 0; + + num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", + bfqd->queued); + + spin_lock_irq(bfqd->queue->queue_lock); + + num_char += sprintf(page + num_char, "Active:\n"); + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + bfqq->queued[0], + bfqq->queued[1], + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + num_char += sprintf(page + num_char, "Idle:\n"); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - + bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + spin_unlock_irq(bfqd->queue->queue_lock); + + return num_char; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return bfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +SHOW_FUNCTION(bfq_max_budget_async_rq_show, + bfqd->bfq_max_budget_async_rq, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); +SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); +SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); +SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); +SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, + 1); +SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t \ +__FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_max_budget_async_rq_store, 
&bfqd->bfq_max_budget_async_rq, + 1, INT_MAX, 0); +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); +STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, + 1); +STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, + &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, + INT_MAX, 0); +#undef STORE_FUNCTION + +/* do nothing for the moment */ +static ssize_t bfq_weights_store(struct elevator_queue *e, + const char *page, size_t count) +{ + return count; +} + +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +{ + u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) + return bfq_calc_max_budget(bfqd->peak_rate, timeout); + else + return bfq_default_max_budget; +} + +static ssize_t bfq_max_budget_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + else { + if (__data > INT_MAX) + __data = INT_MAX; + bfqd->bfq_max_budget = __data; + } + + bfqd->bfq_user_max_budget = __data; + + return ret; +} + +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data < 1) + __data = 1; + else if (__data > INT_MAX) + __data = INT_MAX; + + bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_low_latency_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (__data == 0 && bfqd->low_latency != 0) + bfq_end_wr(bfqd); + bfqd->low_latency = __data; + + return ret; +} + +#define BFQ_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) + +static struct elv_fs_entry bfq_attrs[] = { + BFQ_ATTR(fifo_expire_sync), + BFQ_ATTR(fifo_expire_async), + BFQ_ATTR(back_seek_max), + BFQ_ATTR(back_seek_penalty), + BFQ_ATTR(slice_idle), + BFQ_ATTR(max_budget), + BFQ_ATTR(max_budget_async_rq), + BFQ_ATTR(timeout_sync), + BFQ_ATTR(timeout_async), + BFQ_ATTR(low_latency), + BFQ_ATTR(wr_coeff), + BFQ_ATTR(wr_max_time), + BFQ_ATTR(wr_rt_max_time), + BFQ_ATTR(wr_min_idle_time), + BFQ_ATTR(wr_min_inter_arr_async), + BFQ_ATTR(wr_max_softrt_rate), + BFQ_ATTR(weights), + __ATTR_NULL +}; + +static struct elevator_type iosched_bfq = { + .ops = { + .elevator_merge_fn = bfq_merge, + .elevator_merged_fn = bfq_merged_request, + .elevator_merge_req_fn = bfq_merged_requests, + .elevator_allow_merge_fn = bfq_allow_merge, + .elevator_dispatch_fn = bfq_dispatch_requests, + .elevator_add_req_fn = bfq_insert_request, + .elevator_activate_req_fn = bfq_activate_request, + .elevator_deactivate_req_fn = 
bfq_deactivate_request, + .elevator_completed_req_fn = bfq_completed_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = elv_rb_latter_request, + .elevator_init_icq_fn = bfq_init_icq, + .elevator_exit_icq_fn = bfq_exit_icq, + .elevator_set_req_fn = bfq_set_request, + .elevator_put_req_fn = bfq_put_request, + .elevator_may_queue_fn = bfq_may_queue, + .elevator_init_fn = bfq_init_queue, + .elevator_exit_fn = bfq_exit_queue, + }, + .icq_size = sizeof(struct bfq_io_cq), + .icq_align = __alignof__(struct bfq_io_cq), + .elevator_attrs = bfq_attrs, + .elevator_name = "bfq", + .elevator_owner = THIS_MODULE, +}; + +static int __init bfq_init(void) +{ + /* + * Can be 0 on HZ < 1000 setups. + */ + if (bfq_slice_idle == 0) + bfq_slice_idle = 1; + + if (bfq_timeout_async == 0) + bfq_timeout_async = 1; + + if (bfq_slab_setup()) + return -ENOMEM; + + /* + * Times to load large popular applications for the typical systems + * installed on the reference devices (see the comments before the + * definitions of the two arrays). + */ + T_slow[0] = msecs_to_jiffies(2600); + T_slow[1] = msecs_to_jiffies(1000); + T_fast[0] = msecs_to_jiffies(5500); + T_fast[1] = msecs_to_jiffies(2000); + + /* + * Thresholds that determine the switch between speed classes (see + * the comments before the definition of the array). + */ + device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; + device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; + + elv_register(&iosched_bfq); + pr_info("BFQ I/O-scheduler: v7r8"); + + return 0; +} + +static void __exit bfq_exit(void) +{ + elv_unregister(&iosched_bfq); + bfq_slab_kill(); +} + +module_init(bfq_init); +module_exit(bfq_exit); + +MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); +MODULE_LICENSE("GPL"); diff -Nur linux-4.2/block/bfq-sched.c linux-4.2-pck/block/bfq-sched.c --- linux-4.2/block/bfq-sched.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/block/bfq-sched.c 2015-09-08 00:48:22.218364355 -0300 @@ -0,0 +1,1180 @@ +/* + * BFQ: Hierarchical B-WF2Q+ scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + */ + +#ifdef CONFIG_CGROUP_BFQIO +#define for_each_entity(entity) \ + for (; entity != NULL; entity = entity->parent) + +#define for_each_entity_safe(entity, parent) \ + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd); + +static inline void bfq_update_budget(struct bfq_entity *next_in_service) +{ + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + + BUG_ON(next_in_service == NULL); + + group_sd = next_in_service->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity + * as it must never become an in-service entity. 
+ */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity != NULL) + bfqg_entity->budget = next_in_service->budget; +} + +static int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ + struct bfq_entity *next_in_service; + + if (sd->in_service_entity != NULL) + /* will update/requeue at the end of service */ + return 0; + + /* + * NOTE: this can be improved in many ways, such as returning + * 1 (and thus propagating upwards the update) only when the + * budget changes, or caching the bfqq that will be scheduled + * next from this subtree. By now we worry more about + * correctness than about performance... + */ + next_in_service = bfq_lookup_next_entity(sd, 0, NULL); + sd->next_in_service = next_in_service; + + if (next_in_service != NULL) + bfq_update_budget(next_in_service); + + return 1; +} + +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ + BUG_ON(sd->next_in_service != entity); +} +#else +#define for_each_entity(entity) \ + for (; entity != NULL; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity != NULL; entity = parent) + +static inline int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ + return 0; +} + +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ +} + +static inline void bfq_update_budget(struct bfq_entity *next_in_service) +{ +} +#endif + +/* + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system + * (big shift values increase it), and the period of virtual time + * wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static inline int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = NULL; + + BUG_ON(entity == NULL); + + if (entity->my_sched_data == NULL) + bfqq = container_of(entity, struct bfq_queue, entity); + + return bfqq; +} + + +/** + * bfq_delta - map service into the virtual time domain. + * @service: amount of service. + * @weight: scale factor (weight of an entity or weight sum). + */ +static inline u64 bfq_delta(unsigned long service, + unsigned long weight) +{ + u64 d = (u64)service << WFQ_SERVICE_SHIFT; + + do_div(d, weight); + return d; +} + +/** + * bfq_calc_finish - assign the finish time to an entity. + * @entity: the entity to act upon. + * @service: the service to be charged to the entity. + */ +static inline void bfq_calc_finish(struct bfq_entity *entity, + unsigned long service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(entity->weight == 0); + + entity->finish = entity->start + + bfq_delta(service, entity->weight); + + if (bfqq != NULL) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: start %llu, finish %llu, delta %llu", + entity->start, entity->finish, + bfq_delta(service, entity->weight)); + } +} + +/** + * bfq_entity_of - get an entity from a node. + * @node: the node field of the entity. + * + * Convert a node pointer to the relative entity. 
This is used only + * to simplify the logic of some functions and not as the generic + * conversion mechanism because, e.g., in the tree walking functions, + * the check for a %NULL value would be redundant. + */ +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) +{ + struct bfq_entity *entity = NULL; + + if (node != NULL) + entity = rb_entry(node, struct bfq_entity, rb_node); + + return entity; +} + +/** + * bfq_extract - remove an entity from a tree. + * @root: the tree root. + * @entity: the entity to remove. + */ +static inline void bfq_extract(struct rb_root *root, + struct bfq_entity *entity) +{ + BUG_ON(entity->tree != root); + + entity->tree = NULL; + rb_erase(&entity->rb_node, root); +} + +/** + * bfq_idle_extract - extract an entity from the idle tree. + * @st: the service tree of the owning @entity. + * @entity: the entity being removed. + */ +static void bfq_idle_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *next; + + BUG_ON(entity->tree != &st->idle); + + if (entity == st->first_idle) { + next = rb_next(&entity->rb_node); + st->first_idle = bfq_entity_of(next); + } + + if (entity == st->last_idle) { + next = rb_prev(&entity->rb_node); + st->last_idle = bfq_entity_of(next); + } + + bfq_extract(&st->idle, entity); + + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_insert - generic tree insertion. + * @root: tree root. + * @entity: entity to insert. + * + * This is used for the idle and the active tree, since they are both + * ordered by finish time. + */ +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) +{ + struct bfq_entity *entry; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + + BUG_ON(entity->tree != NULL); + + while (*node != NULL) { + parent = *node; + entry = rb_entry(parent, struct bfq_entity, rb_node); + + if (bfq_gt(entry->finish, entity->finish)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, root); + + entity->tree = root; +} + +/** + * bfq_update_min - update the min_start field of a entity. + * @entity: the entity to update. + * @node: one of its children. + * + * This function is called when @entity may store an invalid value for + * min_start due to updates to the active tree. The function assumes + * that the subtree rooted at @node (which may be its left or its right + * child) has a valid min_start value. + */ +static inline void bfq_update_min(struct bfq_entity *entity, + struct rb_node *node) +{ + struct bfq_entity *child; + + if (node != NULL) { + child = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entity->min_start, child->min_start)) + entity->min_start = child->min_start; + } +} + +/** + * bfq_update_active_node - recalculate min_start. + * @node: the node to update. + * + * @node may have changed position or one of its children may have moved, + * this function updates its min_start value. The left and right subtrees + * are assumed to hold a correct min_start value. + */ +static inline void bfq_update_active_node(struct rb_node *node) +{ + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + + entity->min_start = entity->start; + bfq_update_min(entity, node->rb_right); + bfq_update_min(entity, node->rb_left); +} + +/** + * bfq_update_active_tree - update min_start for the whole active tree. + * @node: the starting node. 
+ * + * @node must be the deepest modified node after an update. This function + * updates its min_start using the values held by its children, assuming + * that they did not change, and then updates all the nodes that may have + * changed in the path to the root. The only nodes that may have changed + * are the ones in the path or their siblings. + */ +static void bfq_update_active_tree(struct rb_node *node) +{ + struct rb_node *parent; + +up: + bfq_update_active_node(node); + + parent = rb_parent(node); + if (parent == NULL) + return; + + if (node == parent->rb_left && parent->rb_right != NULL) + bfq_update_active_node(parent->rb_right); + else if (parent->rb_left != NULL) + bfq_update_active_node(parent->rb_left); + + node = parent; + goto up; +} + +static void bfq_weights_tree_add(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); + +static void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); + + +/** + * bfq_active_insert - insert an entity in the active tree of its + * group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. + * + * The active tree is ordered by finish time, but an extra key is kept + * per each node, containing the minimum value for the start times of + * its children (and the node itself), so it's possible to search for + * the eligible node with the lowest finish time in logarithmic time. + */ +static void bfq_active_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + bfq_insert(&st->active, entity); + + if (node->rb_left != NULL) + node = node->rb_left; + else if (node->rb_right != NULL) + node = node->rb_right; + + bfq_update_active_tree(node); + +#ifdef CONFIG_CGROUP_BFQIO + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); +#ifdef CONFIG_CGROUP_BFQIO + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); + } + if (bfqg != bfqd->root_group) { + BUG_ON(!bfqg); + BUG_ON(!bfqd); + bfqg->active_entities++; + if (bfqg->active_entities == 2) + bfqd->active_numerous_groups++; + } +#endif +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ +static inline unsigned short bfq_ioprio_to_weight(int ioprio) +{ + BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + return IOPRIO_BE_NR - ioprio; +} + +/** + * bfq_weight_to_ioprio - calc an ioprio from a weight. + * @weight: the weight value to convert. + * + * To preserve as mush as possible the old only-ioprio user interface, + * 0 is used as an escape ioprio value for weights (numerically) equal or + * larger than IOPRIO_BE_NR + */ +static inline unsigned short bfq_weight_to_ioprio(int weight) +{ + BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); + return IOPRIO_BE_NR - weight < 0 ? 
0 : IOPRIO_BE_NR - weight; +} + +static inline void bfq_get_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq != NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + } +} + +/** + * bfq_find_deepest - find the deepest node that an extraction can modify. + * @node: the node being removed. + * + * Do the first step of an extraction in an rb tree, looking for the + * node that will replace @node, and returning the deepest node that + * the following modifications to the tree can touch. If @node is the + * last node in the tree return %NULL. + */ +static struct rb_node *bfq_find_deepest(struct rb_node *node) +{ + struct rb_node *deepest; + + if (node->rb_right == NULL && node->rb_left == NULL) + deepest = rb_parent(node); + else if (node->rb_right == NULL) + deepest = node->rb_left; + else if (node->rb_left == NULL) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right != NULL) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/** + * bfq_active_extract - remove an entity from the active tree. + * @st: the service_tree containing the tree. + * @entity: the entity being removed. + */ +static void bfq_active_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + node = bfq_find_deepest(&entity->rb_node); + bfq_extract(&st->active, entity); + + if (node != NULL) + bfq_update_active_tree(node); + +#ifdef CONFIG_CGROUP_BFQIO + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +#ifdef CONFIG_CGROUP_BFQIO + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_remove(bfqd, entity, + &bfqd->group_weights_tree); + } + if (bfqg != bfqd->root_group) { + BUG_ON(!bfqg); + BUG_ON(!bfqd); + BUG_ON(!bfqg->active_entities); + bfqg->active_entities--; + if (bfqg->active_entities == 1) { + BUG_ON(!bfqd->active_numerous_groups); + bfqd->active_numerous_groups--; + } + } +#endif +} + +/** + * bfq_idle_insert - insert an entity into the idle tree. + * @st: the service tree containing the tree. + * @entity: the entity to insert. + */ +static void bfq_idle_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) + st->first_idle = entity; + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) + st->last_idle = entity; + + bfq_insert(&st->idle, entity); + + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); +} + +/** + * bfq_forget_entity - remove an entity from the wfq trees. + * @st: the service tree. + * @entity: the entity being removed. + * + * Update the device status and forget everything about @entity, putting + * the device reference to it, if it is a queue. Entities belonging to + * groups are not refcounted. 
+ */ +static void bfq_forget_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_sched_data *sd; + + BUG_ON(!entity->on_st); + + entity->on_st = 0; + st->wsum -= entity->weight; + if (bfqq != NULL) { + sd = entity->sched_data; + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +/** + * bfq_put_idle_entity - release the idle tree ref of an entity. + * @st: service tree for the entity. + * @entity: the entity being released. + */ +static void bfq_put_idle_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + bfq_idle_extract(st, entity); + bfq_forget_entity(st, entity); +} + +/** + * bfq_forget_idle - update the idle tree if necessary. + * @st: the service tree to act upon. + * + * To preserve the global O(log N) complexity we only remove one entry here; + * as the idle tree will not grow indefinitely this can be done safely. + */ +static void bfq_forget_idle(struct bfq_service_tree *st) +{ + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && + !bfq_gt(last_idle->finish, st->vtime)) { + /* + * Forget the whole idle tree, increasing the vtime past + * the last finish time of idle entities. + */ + st->vtime = last_idle->finish; + } + + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) + bfq_put_idle_entity(st, first_idle); +} + +static struct bfq_service_tree * +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + struct bfq_entity *entity) +{ + struct bfq_service_tree *new_st = old_st; + + if (entity->ioprio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned short prev_weight, new_weight; + struct bfq_data *bfqd = NULL; + struct rb_root *root; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd; + struct bfq_group *bfqg; +#endif + + if (bfqq != NULL) + bfqd = bfqq->bfqd; +#ifdef CONFIG_CGROUP_BFQIO + else { + sd = entity->my_sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; + BUG_ON(!bfqd); + } +#endif + + BUG_ON(old_st->wsum < entity->weight); + old_st->wsum -= entity->weight; + + if (entity->new_weight != entity->orig_weight) { + if (entity->new_weight < BFQ_MIN_WEIGHT || + entity->new_weight > BFQ_MAX_WEIGHT) { + printk(KERN_CRIT "update_weight_prio: " + "new_weight %d\n", + entity->new_weight); + BUG(); + } + entity->orig_weight = entity->new_weight; + entity->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); + } + + entity->ioprio_class = entity->new_ioprio_class; + entity->ioprio_changed = 0; + + /* + * NOTE: here we may be changing the weight too early, + * this will cause unfairness. The correct approach + * would have required additional complexity to defer + * weight changes to the proper time instants (i.e., + * when entity->finish <= old_st->vtime). + */ + new_st = bfq_entity_service_tree(entity); + + prev_weight = entity->weight; + new_weight = entity->orig_weight * + (bfqq != NULL ? bfqq->wr_coeff : 1); + /* + * If the weight of the entity changes, remove the entity + * from its old weight counter (if there is a counter + * associated with the entity), and add it to the counter + * associated with its new weight. + */ + if (prev_weight != new_weight) { + root = bfqq ? 
&bfqd->queue_weights_tree : + &bfqd->group_weights_tree; + bfq_weights_tree_remove(bfqd, entity, root); + } + entity->weight = new_weight; + /* + * Add the entity to its weights tree only if it is + * not associated with a weight-raised queue. + */ + if (prev_weight != new_weight && + (bfqq ? bfqq->wr_coeff == 1 : 1)) + /* If we get here, root has been initialized. */ + bfq_weights_tree_add(bfqd, entity, root); + + new_st->wsum += entity->weight; + + if (new_st != old_st) + entity->start = new_st->vtime; + } + + return new_st; +} + +/** + * bfq_bfqq_served - update the scheduler status after selection for + * service. + * @bfqq: the queue being served. + * @served: bytes to transfer. + * + * NOTE: this can be optimized, as the timestamps of upper level entities + * are synchronized every time a new bfqq is selected for service. By now, + * we keep it to better check consistency. + */ +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st; + + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); + + entity->service += served; + BUG_ON(entity->service > entity->budget); + BUG_ON(st->wsum == 0); + + st->vtime += bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); +} + +/** + * bfq_bfqq_charge_full_budget - set the service to the entity budget. + * @bfqq: the queue that needs a service update. + * + * When it's not possible to be fair in the service domain, because + * a queue is not consuming its budget fast enough (the meaning of + * fast depends on the timeout parameter), we charge it a full + * budget. In this way we should obtain a sort of time-domain + * fairness among all the seeky/slow queues. + */ +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); + + bfq_bfqq_served(bfqq, entity->budget - entity->service); +} + +/** + * __bfq_activate_entity - activate an entity. + * @entity: the entity being activated. + * + * Called whenever an entity is activated, i.e., it is not active and one + * of its children receives a new request, or has to be reactivated due to + * budget exhaustion. It uses the current budget of the entity (and the + * service received if @entity is active) of the queue to calculate its + * timestamps. + */ +static void __bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (entity == sd->in_service_entity) { + BUG_ON(entity->tree != NULL); + /* + * If we are requeueing the current entity we have + * to take care of not charging to it service it has + * not received. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) { + /* + * Requeueing an entity due to a change of some + * next_in_service entity below it. We reuse the + * old start time. + */ + bfq_active_extract(st, entity); + } else if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(st->vtime, entity->finish) ? 
+ st->vtime : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = st->vtime; + st->wsum += entity->weight; + bfq_get_entity(entity); + + BUG_ON(entity->on_st); + entity->on_st = 1; + } + + st = __bfq_entity_update_weight_prio(st, entity); + bfq_calc_finish(entity, entity->budget); + bfq_active_insert(st, entity); +} + +/** + * bfq_activate_entity - activate an entity and its ancestors if necessary. + * @entity: the entity to activate. + * + * Activate @entity and all the entities on the path from it to the root. + */ +static void bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd; + + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + /* + * No need to propagate the activation to the + * upper entities, as they will be updated when + * the in-service entity is rescheduled. + */ + break; + } +} + +/** + * __bfq_deactivate_entity - deactivate an entity from its service tree. + * @entity: the entity to deactivate. + * @requeue: if false, the entity will not be put into the idle tree. + * + * Deactivate an entity, independently from its previous state. If the + * entity was not on a service tree just return, otherwise if it is on + * any scheduler tree, extract it from that tree, and if necessary + * and if the caller did not specify @requeue, put it on the idle tree. + * + * Return %1 if the caller should update the entity hierarchy, i.e., + * if the entity was in service or if it was the next_in_service for + * its sched_data; return %0 otherwise. + */ +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + int was_in_service = entity == sd->in_service_entity; + int ret = 0; + + if (!entity->on_st) + return 0; + + BUG_ON(was_in_service && entity->tree != NULL); + + if (was_in_service) { + bfq_calc_finish(entity, entity->service); + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) + bfq_active_extract(st, entity); + else if (entity->tree == &st->idle) + bfq_idle_extract(st, entity); + else if (entity->tree != NULL) + BUG(); + + if (was_in_service || sd->next_in_service == entity) + ret = bfq_update_next_in_service(sd); + + if (!requeue || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity); + else + bfq_idle_insert(st, entity); + + BUG_ON(sd->in_service_entity == entity); + BUG_ON(sd->next_in_service == entity); + + return ret; +} + +/** + * bfq_deactivate_entity - deactivate an entity. + * @entity: the entity to deactivate. + * @requeue: true if the entity can be put on the idle tree + */ +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd; + struct bfq_entity *parent; + + for_each_entity_safe(entity, parent) { + sd = entity->sched_data; + + if (!__bfq_deactivate_entity(entity, requeue)) + /* + * The parent entity is still backlogged, and + * we don't need to update it as it is still + * in service. + */ + break; + + if (sd->next_in_service != NULL) + /* + * The parent entity is still backlogged and + * the budgets on the path towards the root + * need to be updated. + */ + goto update; + + /* + * If we reach there the parent is no more backlogged and + * we want to propagate the dequeue upwards. 
+ */ + requeue = 1; + } + + return; + +update: + entity = parent; + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + break; + } +} + +/** + * bfq_update_vtime - update vtime if necessary. + * @st: the service tree to act upon. + * + * If necessary update the service tree vtime to have at least one + * eligible entity, skipping to its start time. Assumes that the + * active tree of the device is not empty. + * + * NOTE: this hierarchical implementation updates vtimes quite often, + * we may end up with reactivated processes getting timestamps after a + * vtime skip done because we needed a ->first_active entity on some + * intermediate node. + */ +static void bfq_update_vtime(struct bfq_service_tree *st) +{ + struct bfq_entity *entry; + struct rb_node *node = st->active.rb_node; + + entry = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entry->min_start, st->vtime)) { + st->vtime = entry->min_start; + bfq_forget_idle(st); + } +} + +/** + * bfq_first_active_entity - find the eligible entity with + * the smallest finish time + * @st: the service tree to select from. + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is + * a subtree with at least one eligible (start >= vtime) entity. The path on + * the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. + */ +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) +{ + struct bfq_entity *entry, *first = NULL; + struct rb_node *node = st->active.rb_node; + + while (node != NULL) { + entry = rb_entry(node, struct bfq_entity, rb_node); +left: + if (!bfq_gt(entry->start, st->vtime)) + first = entry; + + BUG_ON(bfq_gt(entry->min_start, st->vtime)); + + if (node->rb_left != NULL) { + entry = rb_entry(node->rb_left, + struct bfq_entity, rb_node); + if (!bfq_gt(entry->min_start, st->vtime)) { + node = node->rb_left; + goto left; + } + } + if (first != NULL) + break; + node = node->rb_right; + } + + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * Update the virtual time in @st and return the first eligible entity + * it contains. + */ +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, + bool force) +{ + struct bfq_entity *entity, *new_next_in_service = NULL; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + bfq_update_vtime(st); + entity = bfq_first_active_entity(st); + BUG_ON(bfq_gt(entity->start, st->vtime)); + + /* + * If the chosen entity does not match with the sched_data's + * next_in_service and we are forcedly serving the IDLE priority + * class tree, bubble up budget update. + */ + if (unlikely(force && entity != entity->sched_data->next_in_service)) { + new_next_in_service = entity; + for_each_entity(new_next_in_service) + bfq_update_budget(new_next_in_service); + } + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * @extract: if true the returned entity will be also extracted from @sd. 
+ * + * NOTE: since we cache the next_in_service entity at each level of the + * hierarchy, the complexity of the lookup can be decreased with + * absolutely no effort just returning the cached next_in_service value; + * we prefer to do full lookups to test the consistency of * the data + * structures. + */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_entity *entity; + int i = 0; + + BUG_ON(sd->in_service_entity != NULL); + + if (bfqd != NULL && + jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, + true); + if (entity != NULL) { + i = BFQ_IOPRIO_CLASSES - 1; + bfqd->bfq_class_idle_last_service = jiffies; + sd->next_in_service = entity; + } + } + for (; i < BFQ_IOPRIO_CLASSES; i++) { + entity = __bfq_lookup_next_entity(st + i, false); + if (entity != NULL) { + if (extract) { + bfq_check_next_in_service(sd, entity); + bfq_active_extract(st + i, entity); + sd->in_service_entity = entity; + sd->next_in_service = NULL; + } + break; + } + } + + return entity; +} + +/* + * Get next queue for service. + */ +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + BUG_ON(bfqd->in_service_queue != NULL); + + if (bfqd->busy_queues == 0) + return NULL; + + sd = &bfqd->root_group->sched_data; + for (; sd != NULL; sd = entity->my_sched_data) { + entity = bfq_lookup_next_entity(sd, 1, bfqd); + BUG_ON(entity == NULL); + entity->service = 0; + } + + bfqq = bfq_entity_to_bfqq(entity); + BUG_ON(bfqq == NULL); + + return bfqq; +} + +static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +{ + if (bfqd->in_service_bic != NULL) { + put_io_context(bfqd->in_service_bic->icq.ioc); + bfqd->in_service_bic = NULL; + } + + bfqd->in_service_queue = NULL; + del_timer(&bfqd->idle_slice_timer); +} + +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfqq == bfqd->in_service_queue) + __bfq_bfqd_reset_in_service(bfqd); + + bfq_deactivate_entity(entity, requeue); +} + +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_entity(entity); +} + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. + */ +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + BUG_ON(bfqd->busy_queues == 0); + bfqd->busy_queues--; + + if (!bfqq->dispatched) { + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->busy_in_flight_queues); + bfqd->busy_in_flight_queues--; + if (bfq_bfqq_constantly_seeky(bfqq)) { + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + } + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues--; + + bfq_deactivate_bfqq(bfqd, bfqq, requeue); +} + +/* + * Called when an inactive queue receives a new request. 
+ */ +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqq == bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + + bfq_activate_bfqq(bfqd, bfqq); + + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; + + if (!bfqq->dispatched) { + if (bfqq->wr_coeff == 1) + bfq_weights_tree_add(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + bfqd->busy_in_flight_queues++; + if (bfq_bfqq_constantly_seeky(bfqq)) + bfqd->const_seeky_busy_in_flight_queues++; + } + } + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues++; +} diff -Nur linux-4.2/block/bfq.h linux-4.2-pck/block/bfq.h --- linux-4.2/block/bfq.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/block/bfq.h 2015-09-08 00:48:22.218364355 -0300 @@ -0,0 +1,807 @@ +/* + * BFQ-v7r8 for 4.1.0: data structures and common functions prototypes. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + */ + +#ifndef _BFQ_H +#define _BFQ_H + +#include +#include +#include +#include + +#define BFQ_IOPRIO_CLASSES 3 +#define BFQ_CL_IDLE_TIMEOUT (HZ/5) + +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 + +#define BFQ_DEFAULT_QUEUE_IOPRIO 4 + +#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_DEFAULT_GRP_IOPRIO 0 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + +struct bfq_entity; + +/** + * struct bfq_service_tree - per ioprio_class service tree. + * @active: tree for active entities (i.e., those backlogged). + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). + * @first_idle: idle entity with minimum F_i. + * @last_idle: idle entity with maximum F_i. + * @vtime: scheduler virtual time. + * @wsum: scheduler weight sum; active and idle entities contribute to it. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own + * bfq_service_tree. All the fields are protected by the queue lock + * of the containing bfqd. + */ +struct bfq_service_tree { + struct rb_root active; + struct rb_root idle; + + struct bfq_entity *first_idle; + struct bfq_entity *last_idle; + + u64 vtime; + unsigned long wsum; +}; + +/** + * struct bfq_sched_data - multi-class scheduler. + * @in_service_entity: entity in service. + * @next_in_service: head-of-the-line entity in the scheduler. + * @service_tree: array of service trees, one per ioprio_class. + * + * bfq_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as + * an intermediate queue on a hierarchical setup. + * @next_in_service points to the active entity of the sched_data + * service trees that will be scheduled next. + * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. + * Requests from higher priority queues are served before all the + * requests from lower priority queues; among requests of the same + * queue requests are served according to B-WF2Q+. + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_sched_data { + struct bfq_entity *in_service_entity; + struct bfq_entity *next_in_service; + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; +}; + +/** + * struct bfq_weight_counter - counter of the number of all active entities + * with a given weight. 
+ * @weight: weight of the entities that this counter refers to. + * @num_active: number of active entities with this weight. + * @weights_node: weights tree member (see bfq_data's @queue_weights_tree + * and @group_weights_tree). + */ +struct bfq_weight_counter { + short int weight; + unsigned int num_active; + struct rb_node weights_node; +}; + +/** + * struct bfq_entity - schedulable entity. + * @rb_node: service_tree member. + * @weight_counter: pointer to the weight counter associated with this entity. + * @on_st: flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). + * @finish: B-WF2Q+ finish timestamp (aka F_i). + * @start: B-WF2Q+ start timestamp (aka S_i). + * @tree: tree the entity is enqueued into; %NULL if not on a tree. + * @min_start: minimum start time of the (active) subtree rooted at + * this entity; used for O(log N) lookups into active trees. + * @service: service received during the last round of service. + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. + * @weight: weight of the queue + * @parent: parent entity, for hierarchical scheduling. + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the + * associated scheduler queue, %NULL on leaf nodes. + * @sched_data: the scheduler queue this entity belongs to. + * @ioprio: the ioprio in use. + * @new_weight: when a weight change is requested, the new weight value. + * @orig_weight: original weight, used to implement weight boosting + * @new_ioprio: when an ioprio change is requested, the new ioprio value. + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. + * @ioprio_changed: flag, true when the user requested a weight, ioprio or + * ioprio_class change. + * + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. + * + * Each entity stores independently its priority values; this would + * allow different weights on different devices, but this + * functionality is not exported to userspace by now. Priorities and + * weights are updated lazily, first storing the new values into the + * new_* fields, then setting the @ioprio_changed flag. As soon as + * there is a transition in the entity state that allows the priority + * update to take place the effective and the requested priority + * values are synchronized. + * + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * ``well-behaved'' queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. 
+ */
+struct bfq_entity {
+	struct rb_node rb_node;
+	struct bfq_weight_counter *weight_counter;
+
+	int on_st;
+
+	u64 finish;
+	u64 start;
+
+	struct rb_root *tree;
+
+	u64 min_start;
+
+	unsigned long service, budget;
+	unsigned short weight, new_weight;
+	unsigned short orig_weight;
+
+	struct bfq_entity *parent;
+
+	struct bfq_sched_data *my_sched_data;
+	struct bfq_sched_data *sched_data;
+
+	unsigned short ioprio, new_ioprio;
+	unsigned short ioprio_class, new_ioprio_class;
+
+	int ioprio_changed;
+};
+
+struct bfq_group;
+
+/**
+ * struct bfq_queue - leaf schedulable entity.
+ * @ref: reference counter.
+ * @bfqd: parent bfq_data.
+ * @new_bfqq: shared bfq_queue if queue is cooperating with
+ *            one or more other queues.
+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
+ * @sort_list: sorted list of pending requests.
+ * @next_rq: if fifo isn't expired, next request to serve.
+ * @queued: nr of requests queued in @sort_list.
+ * @allocated: currently allocated requests.
+ * @meta_pending: pending metadata requests.
+ * @fifo: fifo list of requests in sort_list.
+ * @entity: entity representing this queue in the scheduler.
+ * @max_budget: maximum budget allowed from the feedback mechanism.
+ * @budget_timeout: budget expiration (in jiffies).
+ * @dispatched: number of requests on the dispatch list or inside driver.
+ * @flags: status flags.
+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
+ * @burst_list_node: node for the device's burst list.
+ * @seek_samples: number of seeks sampled
+ * @seek_total: sum of the distances of the seeks sampled
+ * @seek_mean: mean seek distance
+ * @last_request_pos: position of the last request enqueued
+ * @requests_within_timer: number of consecutive pairs of request completion
+ *                         and arrival, such that the queue becomes idle
+ *                         after the completion, but the next request arrives
+ *                         within an idle time slice; used only if the queue's
+ *                         IO_bound has been cleared.
+ * @pid: pid of the process owning the queue, used for logging purposes.
+ * @last_wr_start_finish: start time of the current weight-raising period if
+ *                        the @bfq_queue is being weight-raised, otherwise
+ *                        finish time of the last weight-raising period
+ * @wr_cur_max_time: current max raising time for this queue
+ * @soft_rt_next_start: minimum time instant such that, only if a new
+ *                      request is enqueued after this time instant in an
+ *                      idle @bfq_queue with no outstanding requests, then
+ *                      the task associated with the queue is deemed as
+ *                      soft real-time (see the comments to the function
+ *                      bfq_bfqq_softrt_next_start())
+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
+ *                      idle to backlogged
+ * @service_from_backlogged: cumulative service received from the @bfq_queue
+ *                           since the last transition from idle to
+ *                           backlogged
+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
+ *       queue is shared
+ *
+ * A bfq_queue is a leaf request queue; it can be associated with one or
+ * more io_contexts if it is async or shared between cooperating
+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
+ * does not disappear while a bfqq still references it (mostly to avoid
+ * races between request issuing and task migration followed by cgroup
+ * destruction).
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */ +struct bfq_queue { + atomic_t ref; + struct bfq_data *bfqd; + + /* fields for cooperating queues handling */ + struct bfq_queue *new_bfqq; + struct rb_node pos_node; + struct rb_root *pos_root; + + struct rb_root sort_list; + struct request *next_rq; + int queued[2]; + int allocated[2]; + int meta_pending; + struct list_head fifo; + + struct bfq_entity entity; + + unsigned long max_budget; + unsigned long budget_timeout; + + int dispatched; + + unsigned int flags; + + struct list_head bfqq_list; + + struct hlist_node burst_list_node; + + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + sector_t last_request_pos; + + unsigned int requests_within_timer; + + pid_t pid; + struct bfq_io_cq *bic; + + /* weight-raising fields */ + unsigned long wr_cur_max_time; + unsigned long soft_rt_next_start; + unsigned long last_wr_start_finish; + unsigned int wr_coeff; + unsigned long last_idle_bklogged; + unsigned long service_from_backlogged; +}; + +/** + * struct bfq_ttime - per process thinktime stats. + * @ttime_total: total process thinktime + * @ttime_samples: number of thinktime samples + * @ttime_mean: average process thinktime + */ +struct bfq_ttime { + unsigned long last_end_request; + + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + * @icq: associated io_cq structure + * @bfqq: array of two process queues, the sync and the async + * @ttime: associated @bfq_ttime struct + * @wr_time_left: snapshot of the time left before weight raising ends + * for the sync queue associated to this process; this + * snapshot is taken to remember this value while the weight + * raising is suspended because the queue is merged with a + * shared queue, and is used to set @raising_cur_max_time + * when the queue is split from the shared queue and its + * weight is raised again + * @saved_idle_window: same purpose as the previous field for the idle + * window + * @saved_IO_bound: same purpose as the previous two fields for the I/O + * bound classification of a queue + * @saved_in_large_burst: same purpose as the previous fields for the + * value of the field keeping the queue's belonging + * to a large burst + * @was_in_burst_list: true if the queue belonged to a burst list + * before its merge with another cooperating queue + * @cooperations: counter of consecutive successful queue merges underwent + * by any of the process' @bfq_queues + * @failed_cooperations: counter of consecutive failed queue merges of any + * of the process' @bfq_queues + */ +struct bfq_io_cq { + struct io_cq icq; /* must be the first member */ + struct bfq_queue *bfqq[2]; + struct bfq_ttime ttime; + int ioprio; + + unsigned int wr_time_left; + bool saved_idle_window; + bool saved_IO_bound; + + bool saved_in_large_burst; + bool was_in_burst_list; + + unsigned int cooperations; + unsigned int failed_cooperations; +}; + +enum bfq_device_speed { + BFQ_BFQD_FAST, + BFQ_BFQD_SLOW, +}; + +/** + * struct bfq_data - per device data structure. + * @queue: request queue for the managed device. + * @root_group: root bfq_group for the device. + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_close_cooperator()). + * @active_numerous_groups: number of bfq_groups containing more than one + * active @bfq_entity. + * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by + * weight. 
Used to keep track of whether all @bfq_queues + * have the same weight. The tree contains one counter + * for each distinct weight associated to some active + * and not weight-raised @bfq_queue (see the comments to + * the functions bfq_weights_tree_[add|remove] for + * further details). + * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted + * by weight. Used to keep track of whether all + * @bfq_groups have the same weight. The tree contains + * one counter for each distinct weight associated to + * some active @bfq_group (see the comments to the + * functions bfq_weights_tree_[add|remove] for further + * details). + * @busy_queues: number of bfq_queues containing requests (including the + * queue in service, even if it is idling). + * @busy_in_flight_queues: number of @bfq_queues containing pending or + * in-flight requests, plus the @bfq_queue in + * service, even if idle but waiting for the + * possible arrival of its next sync request. This + * field is updated only if the device is rotational, + * but used only if the device is also NCQ-capable. + * The reason why the field is updated also for non- + * NCQ-capable rotational devices is related to the + * fact that the value of @hw_tag may be set also + * later than when busy_in_flight_queues may need to + * be incremented for the first time(s). Taking also + * this possibility into account, to avoid unbalanced + * increments/decrements, would imply more overhead + * than just updating busy_in_flight_queues + * regardless of the value of @hw_tag. + * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues + * (that is, seeky queues that expired + * for budget timeout at least once) + * containing pending or in-flight + * requests, including the in-service + * @bfq_queue if constantly seeky. This + * field is updated only if the device + * is rotational, but used only if the + * device is also NCQ-capable (see the + * comments to @busy_in_flight_queues). + * @wr_busy_queues: number of weight-raised busy @bfq_queues. + * @queued: number of queued requests. + * @rq_in_driver: number of requests dispatched and waiting for completion. + * @sync_flight: number of sync requests in the driver. + * @max_rq_in_driver: max number of reqs in driver in the last + * @hw_tag_samples completed requests. + * @hw_tag_samples: nr of samples used to calculate hw_tag. + * @hw_tag: flag set to one if the driver is showing a queueing behavior. + * @budgets_assigned: number of budgets assigned. + * @idle_slice_timer: timer set when idling for the next sequential request + * from the queue in service. + * @unplug_work: delayed work to restart dispatching on the request queue. + * @in_service_queue: bfq_queue in service. + * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. + * @last_position: on-disk position of the last served request. + * @last_budget_start: beginning of the last budget. + * @last_idling_start: beginning of the last idle slice. + * @peak_rate: peak transfer rate observed for a budget. + * @peak_rate_samples: number of samples used to calculate @peak_rate. + * @bfq_max_budget: maximum budget allotted to a bfq_queue before + * rescheduling. + * @group_list: list of all the bfq_groups active on the device. + * @active_list: list of all the bfq_queues active on the device. + * @idle_list: list of all the bfq_queues idle on the device. + * @bfq_fifo_expire: timeout for async/sync requests; when it expires + * requests are served in fifo order. 
+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
+ * @bfq_back_max: maximum allowed backward seek.
+ * @bfq_slice_idle: maximum idling time.
+ * @bfq_user_max_budget: user-configured max budget value
+ *                       (0 for auto-tuning).
+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
+ *                           async queues.
+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used
+ *               to prevent seeky queues from imposing long latencies on
+ *               well-behaved ones (this also implies that seeky queues
+ *               cannot receive guarantees in the service domain; after a
+ *               timeout they are charged for the whole allocated budget,
+ *               to try to preserve a behavior reasonably fair among them,
+ *               but without service-domain guarantees).
+ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is
+ *                   no longer granted any weight-raising.
+ * @bfq_failed_cooperations: number of consecutive failed cooperation
+ *                           chances after which weight-raising is restored
+ *                           to a queue subject to more than bfq_coop_thresh
+ *                           queue merges.
+ * @bfq_requests_within_timer: number of consecutive requests that must be
+ *                             issued within the idle time slice to set
+ *                             idling again for a queue that was marked as
+ *                             non-I/O-bound (see the definition of the
+ *                             IO_bound flag for further details).
+ * @last_ins_in_burst: last time at which a queue entered the current
+ *                     burst of queues being activated shortly after
+ *                     each other; for more details about this and the
+ *                     following parameters related to a burst of
+ *                     activations, see the comments to the function
+ *                     @bfq_handle_burst.
+ * @bfq_burst_interval: reference time interval used to decide whether a
+ *                      queue has been activated shortly after
+ *                      @last_ins_in_burst.
+ * @burst_size: number of queues in the current burst of queue activations.
+ * @bfq_large_burst_thresh: maximum burst size above which the current
+ *                          queue-activation burst is deemed as 'large'.
+ * @large_burst: true if a large queue-activation burst is in progress.
+ * @burst_list: head of the burst list (as for the above fields, more details
+ *              in the comments to the function bfq_handle_burst).
+ * @low_latency: if set to true, low-latency heuristics are enabled.
+ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised
+ *                queue is multiplied.
+ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).
+ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.
+ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising
+ *                        may be reactivated for a queue (in jiffies).
+ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals
+ *                              after which weight-raising may be
+ *                              reactivated for an already busy queue
+ *                              (in jiffies).
+ * @bfq_wr_max_softrt_rate: max service rate for a soft real-time queue,
+ *                          in sectors per second.
+ * @RT_prod: cached value of the product R*T used for computing the maximum
+ *           duration of the weight raising automatically.
+ * @device_speed: device-speed class for the low-latency heuristic.
+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.
+ *
+ * All the fields are protected by the @queue lock.
+ */ +struct bfq_data { + struct request_queue *queue; + + struct bfq_group *root_group; + struct rb_root rq_pos_tree; + +#ifdef CONFIG_CGROUP_BFQIO + int active_numerous_groups; +#endif + + struct rb_root queue_weights_tree; + struct rb_root group_weights_tree; + + int busy_queues; + int busy_in_flight_queues; + int const_seeky_busy_in_flight_queues; + int wr_busy_queues; + int queued; + int rq_in_driver; + int sync_flight; + + int max_rq_in_driver; + int hw_tag_samples; + int hw_tag; + + int budgets_assigned; + + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + struct bfq_queue *in_service_queue; + struct bfq_io_cq *in_service_bic; + + sector_t last_position; + + ktime_t last_budget_start; + ktime_t last_idling_start; + int peak_rate_samples; + u64 peak_rate; + unsigned long bfq_max_budget; + + struct hlist_head group_list; + struct list_head active_list; + struct list_head idle_list; + + unsigned int bfq_fifo_expire[2]; + unsigned int bfq_back_penalty; + unsigned int bfq_back_max; + unsigned int bfq_slice_idle; + u64 bfq_class_idle_last_service; + + unsigned int bfq_user_max_budget; + unsigned int bfq_max_budget_async_rq; + unsigned int bfq_timeout[2]; + + unsigned int bfq_coop_thresh; + unsigned int bfq_failed_cooperations; + unsigned int bfq_requests_within_timer; + + unsigned long last_ins_in_burst; + unsigned long bfq_burst_interval; + int burst_size; + unsigned long bfq_large_burst_thresh; + bool large_burst; + struct hlist_head burst_list; + + bool low_latency; + + /* parameters of the low_latency heuristics */ + unsigned int bfq_wr_coeff; + unsigned int bfq_wr_max_time; + unsigned int bfq_wr_rt_max_time; + unsigned int bfq_wr_min_idle_time; + unsigned long bfq_wr_min_inter_arr_async; + unsigned int bfq_wr_max_softrt_rate; + u64 RT_prod; + enum bfq_device_speed device_speed; + + struct bfq_queue oom_bfqq; +}; + +enum bfqq_state_flags { + BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ + BFQ_BFQQ_FLAG_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of + * its budget + */ + BFQ_BFQQ_FLAG_in_large_burst, /* + * bfqq activated in a large burst, + * see comments to bfq_handle_burst. 
+ */ + BFQ_BFQQ_FLAG_constantly_seeky, /* + * bfqq has proved to be slow and + * seeky until budget timeout + */ + BFQ_BFQQ_FLAG_softrt_update, /* + * may need softrt-next-start + * update + */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ + BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ +}; + +#define BFQ_BFQQ_FNS(name) \ +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +{ \ + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ +} + +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(must_alloc); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(budget_new); +BFQ_BFQQ_FNS(IO_bound); +BFQ_BFQQ_FNS(in_large_burst); +BFQ_BFQQ_FNS(constantly_seeky); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(just_split); +BFQ_BFQQ_FNS(softrt_update); +#undef BFQ_BFQQ_FNS + +/* Logging facilities. */ +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) + +#define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) + +/* Expiration reasons. */ +enum bfqq_expiration { + BFQ_BFQQ_TOO_IDLE = 0, /* + * queue has been idling for + * too long + */ + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ +}; + +#ifdef CONFIG_CGROUP_BFQIO +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @group_node: node to be inserted into the bfqio_cgroup->group_data + * list of the containing cgroup's bfqio_cgroup. + * @bfqd_node: node to be inserted into the @bfqd->group_list list + * of the groups active on the same device; used for cleanup. + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/ + * migration. + * @active_entities: number of active entities belonging to the group; + * unused for the root group. Used to know whether there + * are groups with more than one active @bfq_entity + * (see the comments to the function + * bfq_bfqq_must_not_expire()). + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @group_node is protected by the bfqio_cgroup lock, and is accessed + * via RCU from its readers. + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. 
+ */ +struct bfq_group { + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + struct hlist_node group_node; + struct hlist_node bfqd_node; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; + + int active_entities; +}; + +/** + * struct bfqio_cgroup - bfq cgroup data structure. + * @css: subsystem state for bfq in the containing cgroup. + * @online: flag marked when the subsystem is inserted. + * @weight: cgroup weight. + * @ioprio: cgroup ioprio. + * @ioprio_class: cgroup ioprio_class. + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. + * @group_data: list containing the bfq_group belonging to this cgroup. + * + * @group_data is accessed using RCU, with @lock protecting the updates, + * @ioprio and @ioprio_class are protected by @lock. + */ +struct bfqio_cgroup { + struct cgroup_subsys_state css; + bool online; + + unsigned short weight, ioprio, ioprio_class; + + spinlock_t lock; + struct hlist_head group_data; +}; +#else +struct bfq_group { + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; +}; +#endif + +static inline struct bfq_service_tree * +bfq_entity_service_tree(struct bfq_entity *entity) +{ + struct bfq_sched_data *sched_data = entity->sched_data; + unsigned int idx = entity->ioprio_class - 1; + + BUG_ON(idx >= BFQ_IOPRIO_CLASSES); + BUG_ON(sched_data == NULL); + + return sched_data->service_tree + idx; +} + +static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, + bool is_sync) +{ + return bic->bfqq[is_sync]; +} + +static inline void bic_set_bfqq(struct bfq_io_cq *bic, + struct bfq_queue *bfqq, bool is_sync) +{ + bic->bfqq[is_sync] = bfqq; +} + +static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +{ + return bic->icq.q->elevator->elevator_data; +} + +/** + * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. + * @ptr: a pointer to a bfqd. + * @flags: storage for the flags to be saved. + * + * This function allows bfqg->bfqd to be protected by the + * queue lock of the bfqd they reference; the pointer is dereferenced + * under RCU, so the storage for bfqd is assured to be safe as long + * as the RCU read side critical section does not end. After the + * bfqd->queue->queue_lock is taken the pointer is rechecked, to be + * sure that no other writer accessed it. If we raced with a writer, + * the function returns NULL, with the queue unlocked, otherwise it + * returns the dereferenced pointer, with the queue locked. 
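bic_to_bfqq() and bic_set_bfqq() above keep one synchronous and one asynchronous queue per I/O context. A hedged sketch of how an allocation path pairs them up; rq, bfqd, bfqg and the GFP_NOIO choice are assumptions about the call site, and bfq_get_queue() is only forward-declared at the end of this header:

bool is_sync = rq_is_sync(rq);
struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);

if (!bfqq || bfqq == &bfqd->oom_bfqq) {
        bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, GFP_NOIO);
        bic_set_bfqq(bic, bfqq, is_sync);
}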
+ */ +static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, + unsigned long *flags) +{ + struct bfq_data *bfqd; + + rcu_read_lock(); + bfqd = rcu_dereference(*(struct bfq_data **)ptr); + + if (bfqd != NULL) { + spin_lock_irqsave(bfqd->queue->queue_lock, *flags); + if (*ptr == bfqd) + goto out; + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); + } + + bfqd = NULL; +out: + rcu_read_unlock(); + return bfqd; +} + +static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, + unsigned long *flags) +{ + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); +} + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic); +static void bfq_put_queue(struct bfq_queue *bfqq); +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct bfq_io_cq *bic, gfp_t gfp_mask); +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg); +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); + +#endif /* _BFQ_H */ diff -Nur linux-4.2/block/blk-core.c linux-4.2-pck/block/blk-core.c --- linux-4.2/block/blk-core.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/block/blk-core.c 2015-09-08 11:20:31.960370046 -0300 @@ -48,6 +48,8 @@ DEFINE_IDA(blk_queue_ida); +int trap_non_toi_io; + /* * For the allocated request tables */ @@ -1988,6 +1990,9 @@ { bio->bi_rw |= rw; + if (unlikely(trap_non_toi_io)) + BUG_ON(!(bio->bi_flags & (1 << BIO_TOI))); + /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. diff -Nur linux-4.2/block/genhd.c linux-4.2-pck/block/genhd.c --- linux-4.2/block/genhd.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/block/genhd.c 2015-09-08 11:20:31.977035898 -0300 @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include @@ -1384,6 +1386,85 @@ EXPORT_SYMBOL(invalidate_partition); +dev_t blk_lookup_fs_info(struct fs_info *seek) +{ + dev_t devt = MKDEV(0, 0); + struct class_dev_iter iter; + struct device *dev; + int best_score = 0; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while (best_score < 3 && (dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct disk_part_iter piter; + struct hd_struct *part; + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + + while (best_score < 3 && (part = disk_part_iter_next(&piter))) { + int score = part_matches_fs_info(part, seek); + if (score > best_score) { + devt = part_devt(part); + best_score = score; + } + } + disk_part_iter_exit(&piter); + } + class_dev_iter_exit(&iter); + return devt; +} + +/* Caller uses NULL, key to start. For each match found, we return a bdev on + * which we have done blkdev_get, and we do the blkdev_put on block devices + * that are passed to us. When no more matches are found, we return NULL. 
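A hedged usage sketch for bfq_get_bfqd_locked()/bfq_put_bfqd_unlock() above, assuming a bfq_group whose ->bfqd pointer was published with rcu_assign_pointer() elsewhere; a NULL return means the race with a writer was lost and nothing is locked:

unsigned long flags;
struct bfq_data *bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);

if (bfqd) {
        /* queue_lock is held and bfqg->bfqd did not change under us */
        /* ... touch bfqd-protected state here ... */
        bfq_put_bfqd_unlock(bfqd, &flags);
}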
+ */ +struct block_device *next_bdev_of_type(struct block_device *last, + const char *key) +{ + dev_t devt = MKDEV(0, 0); + struct class_dev_iter iter; + struct device *dev; + struct block_device *next = NULL, *bdev; + int got_last = 0; + + if (!key) + goto out; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while (!devt && (dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct disk_part_iter piter; + struct hd_struct *part; + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + + while ((part = disk_part_iter_next(&piter))) { + bdev = bdget(part_devt(part)); + if (last && !got_last) { + if (last == bdev) + got_last = 1; + continue; + } + + if (blkdev_get(bdev, FMODE_READ, 0)) + continue; + + if (bdev_matches_key(bdev, key)) { + next = bdev; + break; + } + + blkdev_put(bdev, FMODE_READ); + } + disk_part_iter_exit(&piter); + } + class_dev_iter_exit(&iter); +out: + if (last) + blkdev_put(last, FMODE_READ); + return next; +} + /* * Disk events - monitor disk events like media change and eject request. */ diff -Nur linux-4.2/block/uuid.c linux-4.2-pck/block/uuid.c --- linux-4.2/block/uuid.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/block/uuid.c 2015-09-08 11:20:31.977035898 -0300 @@ -0,0 +1,509 @@ +#include +#include +#include +#include +#include + +static int debug_enabled; + +#define PRINTK(fmt, args...) do { \ + if (debug_enabled) \ + printk(KERN_DEBUG fmt, ## args); \ + } while(0) + +#define PRINT_HEX_DUMP(v1, v2, v3, v4, v5, v6, v7, v8) \ + do { \ + if (debug_enabled) \ + print_hex_dump(v1, v2, v3, v4, v5, v6, v7, v8); \ + } while(0) + +/* + * Simple UUID translation + */ + +struct uuid_info { + const char *key; + const char *name; + long bkoff; + unsigned sboff; + unsigned sig_len; + const char *magic; + int uuid_offset; + int last_mount_offset; + int last_mount_size; +}; + +/* + * Based on libuuid's blkid_magic array. Note that I don't + * have uuid offsets for all of these yet - mssing ones are 0x0. + * Further information welcome. + * + * Rearranged by page of fs signature for optimisation. 
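A hedged caller sketch for next_bdev_of_type() from the genhd.c hunk above: start with NULL, hand the previous device back on every call, and the helper performs the matching blkdev_put(). The "swap" key is one of the entries in the uuid_list[] table that follows:

struct block_device *bdev = NULL;

while ((bdev = next_bdev_of_type(bdev, "swap"))) {
        /* bdev is held via blkdev_get(FMODE_READ) until it is passed back in */
        /* ... inspect the candidate device ... */
        /* on an early exit, release it ourselves with blkdev_put(bdev, FMODE_READ) */
}
/* NULL return: the helper has already released the last device it handed out */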
+ */ +static struct uuid_info uuid_list[] = { + { NULL, "oracleasm", 0, 32, 8, "ORCLDISK", 0x0, 0, 0 }, + { "ntfs", "ntfs", 0, 3, 8, "NTFS ", 0x0, 0, 0 }, + { "vfat", "vfat", 0, 0x52, 5, "MSWIN", 0x0, 0, 0 }, + { "vfat", "vfat", 0, 0x52, 8, "FAT32 ", 0x0, 0, 0 }, + { "vfat", "vfat", 0, 0x36, 5, "MSDOS", 0x0, 0, 0 }, + { "vfat", "vfat", 0, 0x36, 8, "FAT16 ", 0x0, 0, 0 }, + { "vfat", "vfat", 0, 0x36, 8, "FAT12 ", 0x0, 0, 0 }, + { "vfat", "vfat", 0, 0, 1, "\353", 0x0, 0, 0 }, + { "vfat", "vfat", 0, 0, 1, "\351", 0x0, 0, 0 }, + { "vfat", "vfat", 0, 0x1fe, 2, "\125\252", 0x0, 0, 0 }, + { "xfs", "xfs", 0, 0, 4, "XFSB", 0x20, 0, 0 }, + { "romfs", "romfs", 0, 0, 8, "-rom1fs-", 0x0, 0, 0 }, + { "bfs", "bfs", 0, 0, 4, "\316\372\173\033", 0, 0, 0 }, + { "cramfs", "cramfs", 0, 0, 4, "E=\315\050", 0x0, 0, 0 }, + { "qnx4", "qnx4", 0, 4, 6, "QNX4FS", 0, 0, 0 }, + { NULL, "crypt_LUKS", 0, 0, 6, "LUKS\xba\xbe", 0x0, 0, 0 }, + { "squashfs", "squashfs", 0, 0, 4, "sqsh", 0, 0, 0 }, + { "squashfs", "squashfs", 0, 0, 4, "hsqs", 0, 0, 0 }, + { "ocfs", "ocfs", 0, 8, 9, "OracleCFS", 0x0, 0, 0 }, + { "lvm2pv", "lvm2pv", 0, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, + { "sysv", "sysv", 0, 0x3f8, 4, "\020~\030\375", 0, 0, 0 }, + { "ext", "ext", 1, 0x38, 2, "\123\357", 0x468, 0x42c, 4 }, + { "minix", "minix", 1, 0x10, 2, "\177\023", 0, 0, 0 }, + { "minix", "minix", 1, 0x10, 2, "\217\023", 0, 0, 0 }, + { "minix", "minix", 1, 0x10, 2, "\150\044", 0, 0, 0 }, + { "minix", "minix", 1, 0x10, 2, "\170\044", 0, 0, 0 }, + { "lvm2pv", "lvm2pv", 1, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, + { "vxfs", "vxfs", 1, 0, 4, "\365\374\001\245", 0, 0, 0 }, + { "hfsplus", "hfsplus", 1, 0, 2, "BD", 0x0, 0, 0 }, + { "hfsplus", "hfsplus", 1, 0, 2, "H+", 0x0, 0, 0 }, + { "hfsplus", "hfsplus", 1, 0, 2, "HX", 0x0, 0, 0 }, + { "hfs", "hfs", 1, 0, 2, "BD", 0x0, 0, 0 }, + { "ocfs2", "ocfs2", 1, 0, 6, "OCFSV2", 0x0, 0, 0 }, + { "lvm2pv", "lvm2pv", 0, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, + { "lvm2pv", "lvm2pv", 1, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, + { "ocfs2", "ocfs2", 2, 0, 6, "OCFSV2", 0x0, 0, 0 }, + { "swap", "swap", 0, 0xff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, + { "swap", "swap", 0, 0xff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0xff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0xff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0xff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, + { "ocfs2", "ocfs2", 4, 0, 6, "OCFSV2", 0x0, 0, 0 }, + { "ocfs2", "ocfs2", 8, 0, 6, "OCFSV2", 0x0, 0, 0 }, + { "hpfs", "hpfs", 8, 0, 4, "I\350\225\371", 0, 0, 0 }, + { "reiserfs", "reiserfs", 8, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, + { "reiserfs", "reiserfs", 8, 20, 8, "ReIsErFs", 0x10054, 0, 0 }, + { "zfs", "zfs", 8, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, + { "zfs", "zfs", 8, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, + { "ufs", "ufs", 8, 0x55c, 4, "T\031\001\000", 0, 0, 0 }, + { "swap", "swap", 0, 0x1ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, + { "swap", "swap", 0, 0x1ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0x1ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0x1ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0x1ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, + { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr2Fs", 0x10054, 0, 0 }, + { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr3Fs", 0x10054, 0, 0 }, + { "reiserfs", "reiserfs", 64, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, + { "reiser4", "reiser4", 64, 0, 7, "ReIsEr4", 0x100544, 0, 0 }, + { "gfs2", "gfs2", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 
0 }, + { "gfs", "gfs", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 }, + { "btrfs", "btrfs", 64, 0x40, 8, "_BHRfS_M", 0x0, 0, 0 }, + { "swap", "swap", 0, 0x3ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, + { "swap", "swap", 0, 0x3ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0x3ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0x3ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0x3ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, + { "udf", "udf", 32, 1, 5, "BEA01", 0x0, 0, 0 }, + { "udf", "udf", 32, 1, 5, "BOOT2", 0x0, 0, 0 }, + { "udf", "udf", 32, 1, 5, "CD001", 0x0, 0, 0 }, + { "udf", "udf", 32, 1, 5, "CDW02", 0x0, 0, 0 }, + { "udf", "udf", 32, 1, 5, "NSR02", 0x0, 0, 0 }, + { "udf", "udf", 32, 1, 5, "NSR03", 0x0, 0, 0 }, + { "udf", "udf", 32, 1, 5, "TEA01", 0x0, 0, 0 }, + { "iso9660", "iso9660", 32, 1, 5, "CD001", 0x0, 0, 0 }, + { "iso9660", "iso9660", 32, 9, 5, "CDROM", 0x0, 0, 0 }, + { "jfs", "jfs", 32, 0, 4, "JFS1", 0x88, 0, 0 }, + { "swap", "swap", 0, 0x7ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, + { "swap", "swap", 0, 0x7ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0x7ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0x7ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0x7ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, + { "swap", "swap", 0, 0xfff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, + { "swap", "swap", 0, 0xfff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0xfff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0xfff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, + { "swap", "swsuspend", 0, 0xfff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, + { "zfs", "zfs", 264, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, + { "zfs", "zfs", 264, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, + { NULL, NULL, 0, 0, 0, NULL, 0x0, 0, 0 } +}; + +static int null_uuid(const char *uuid) +{ + int i; + + for (i = 0; i < 16 && !uuid[i]; i++); + + return (i == 16); +} + + +static void uuid_end_bio(struct bio *bio, int err) +{ + struct page *page = bio->bi_io_vec[0].bv_page; + + if(!test_bit(BIO_UPTODATE, &bio->bi_flags)) + SetPageError(page); + + unlock_page(page); + bio_put(bio); +} + + +/** + * submit - submit BIO request + * @dev: The block device we're using. + * @page_num: The page we're reading. + * + * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the + * textbook - allocate and initialize the bio. If we're writing, make sure + * the page is marked as dirty. Then submit it and carry on." 
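A worked reading of one uuid_list[] row above, the ext entry { "ext", "ext", 1, 0x38, 2, "\123\357", 0x468, 0x42c, 4 }, using the same arithmetic that bdev_matches_key() and fs_info_from_block_dev() apply further down; the ext2/3/4 superblock offsets are quoted only to show that the constants line up:

int dev_offset = (1 << 10) + 0x38;   /* bkoff is in KiB: magic 0xEF53 ("\123\357") at byte 0x438 */
int pg_num     = dev_offset >> 12;   /* page 0 of the device */
int pg_off     = dev_offset & 0xfff; /* byte 0x438 inside that page */
/* uuid_offset 0x468 = sb + 0x68 (s_uuid, 16 bytes) and last_mount_offset
 * 0x42c = sb + 0x2c (s_mtime, 4 bytes) are absolute byte offsets on the device. */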
+ **/ +static struct page *read_bdev_page(struct block_device *dev, int page_num) +{ + struct bio *bio = NULL; + struct page *page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + + if (!page) { + printk(KERN_ERR "Failed to allocate a page for reading data " + "in UUID checks."); + return NULL; + } + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_bdev = dev; + bio->bi_iter.bi_sector = page_num << 3; + bio->bi_end_io = uuid_end_bio; + bio->bi_flags |= (1 << BIO_TOI); + + PRINTK("Submitting bio on device %lx, page %d using bio %p and page %p.\n", + (unsigned long) dev->bd_dev, page_num, bio, page); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_DEBUG "ERROR: adding page to bio at %d\n", + page_num); + bio_put(bio); + __free_page(page); + printk(KERN_DEBUG "read_bdev_page freed page %p (in error " + "path).\n", page); + return NULL; + } + + lock_page(page); + submit_bio(READ | REQ_SYNC, bio); + + wait_on_page_locked(page); + if (PageError(page)) { + __free_page(page); + page = NULL; + } + return page; +} + +int bdev_matches_key(struct block_device *bdev, const char *key) +{ + unsigned char *data = NULL; + struct page *data_page = NULL; + + int dev_offset, pg_num, pg_off, i; + int last_pg_num = -1; + int result = 0; + char buf[50]; + + if (null_uuid(key)) { + PRINTK("Refusing to find a NULL key.\n"); + return 0; + } + + if (!bdev->bd_disk) { + bdevname(bdev, buf); + PRINTK("bdev %s has no bd_disk.\n", buf); + return 0; + } + + if (!bdev->bd_disk->queue) { + bdevname(bdev, buf); + PRINTK("bdev %s has no queue.\n", buf); + return 0; + } + + for (i = 0; uuid_list[i].name; i++) { + struct uuid_info *dat = &uuid_list[i]; + + if (!dat->key || strcmp(dat->key, key)) + continue; + + dev_offset = (dat->bkoff << 10) + dat->sboff; + pg_num = dev_offset >> 12; + pg_off = dev_offset & 0xfff; + + if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1) + continue; + + if (pg_num != last_pg_num) { + if (data_page) { + kunmap(data_page); + __free_page(data_page); + } + data_page = read_bdev_page(bdev, pg_num); + if (!data_page) + continue; + data = kmap(data_page); + } + + last_pg_num = pg_num; + + if (strncmp(&data[pg_off], dat->magic, dat->sig_len)) + continue; + + result = 1; + break; + } + + if (data_page) { + kunmap(data_page); + __free_page(data_page); + } + + return result; +} + +/* + * part_matches_fs_info - Does the given partition match the details given? + * + * Returns a score saying how good the match is. + * 0 = no UUID match. + * 1 = UUID but last mount time differs. + * 2 = UUID, last mount time but not dev_t + * 3 = perfect match + * + * This lets us cope elegantly with probing resulting in dev_ts changing + * from boot to boot, and with the case where a user copies a partition + * (UUID is non unique), and we need to check the last mount time of the + * correct partition. 
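The score described above is consumed by blk_lookup_fs_info() in the genhd.c hunk, which keeps the best-scoring partition and stops early at a perfect 3. A hedged sketch of the intended round trip (presumably driven by the hibernation code elsewhere in this patchset): capture the filesystem identity once, then re-resolve the device later even if its dev_t has moved:

struct fs_info *seek = fs_info_from_block_dev(bdev);   /* remember identity now */

if (!IS_ERR_OR_NULL(seek)) {
        dev_t found = blk_lookup_fs_info(seek);         /* best UUID/last-mount match */

        if (found != MKDEV(0, 0)) {
                /* ... reopen the device under its current dev_t ... */
        }
        free_fs_info(seek);
}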
+ */ +int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek) +{ + struct block_device *bdev; + struct fs_info *got; + int result = 0; + char buf[50]; + + if (null_uuid((char *) &seek->uuid)) { + PRINTK("Refusing to find a NULL uuid.\n"); + return 0; + } + + bdev = bdget(part_devt(part)); + + PRINTK("part_matches fs info considering %x.\n", part_devt(part)); + + if (blkdev_get(bdev, FMODE_READ, 0)) { + PRINTK("blkdev_get failed.\n"); + return 0; + } + + if (!bdev->bd_disk) { + bdevname(bdev, buf); + PRINTK("bdev %s has no bd_disk.\n", buf); + goto out; + } + + if (!bdev->bd_disk->queue) { + bdevname(bdev, buf); + PRINTK("bdev %s has no queue.\n", buf); + goto out; + } + + got = fs_info_from_block_dev(bdev); + + if (got && !memcmp(got->uuid, seek->uuid, 16)) { + PRINTK(" Have matching UUID.\n"); + PRINTK(" Got: LMS %d, LM %p.\n", got->last_mount_size, got->last_mount); + PRINTK(" Seek: LMS %d, LM %p.\n", seek->last_mount_size, seek->last_mount); + result = 1; + + if (got->last_mount_size == seek->last_mount_size && + got->last_mount && seek->last_mount && + !memcmp(got->last_mount, seek->last_mount, + got->last_mount_size)) { + result = 2; + + PRINTK(" Matching last mount time.\n"); + + if (part_devt(part) == seek->dev_t) { + result = 3; + PRINTK(" Matching dev_t.\n"); + } else + PRINTK("Dev_ts differ (%x vs %x).\n", part_devt(part), seek->dev_t); + } + } + + PRINTK(" Score for %x is %d.\n", part_devt(part), result); + free_fs_info(got); +out: + blkdev_put(bdev, FMODE_READ); + return result; +} + +void free_fs_info(struct fs_info *fs_info) +{ + if (!fs_info || IS_ERR(fs_info)) + return; + + if (fs_info->last_mount) + kfree(fs_info->last_mount); + + kfree(fs_info); +} + +struct fs_info *fs_info_from_block_dev(struct block_device *bdev) +{ + unsigned char *data = NULL; + struct page *data_page = NULL; + + int dev_offset, pg_num, pg_off; + int uuid_pg_num, uuid_pg_off, i; + unsigned char *uuid_data = NULL; + struct page *uuid_data_page = NULL; + + int last_pg_num = -1, last_uuid_pg_num = 0; + char buf[50]; + struct fs_info *fs_info = NULL; + + bdevname(bdev, buf); + + PRINTK("uuid_from_block_dev looking for partition type of %s.\n", buf); + + for (i = 0; uuid_list[i].name; i++) { + struct uuid_info *dat = &uuid_list[i]; + dev_offset = (dat->bkoff << 10) + dat->sboff; + pg_num = dev_offset >> 12; + pg_off = dev_offset & 0xfff; + uuid_pg_num = dat->uuid_offset >> 12; + uuid_pg_off = dat->uuid_offset & 0xfff; + + if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1) + continue; + + /* Ignore partition types with no UUID offset */ + if (!dat->uuid_offset) + continue; + + if (pg_num != last_pg_num) { + if (data_page) { + kunmap(data_page); + __free_page(data_page); + } + data_page = read_bdev_page(bdev, pg_num); + if (!data_page) + continue; + data = kmap(data_page); + } + + last_pg_num = pg_num; + + if (strncmp(&data[pg_off], dat->magic, dat->sig_len)) + continue; + + PRINTK("This partition looks like %s.\n", dat->name); + + fs_info = kzalloc(sizeof(struct fs_info), GFP_KERNEL); + + if (!fs_info) { + PRINTK("Failed to allocate fs_info struct."); + fs_info = ERR_PTR(-ENOMEM); + break; + } + + /* UUID can't be off the end of the disk */ + if ((uuid_pg_num > bdev->bd_part->nr_sects >> 3) || + !dat->uuid_offset) + goto no_uuid; + + if (!uuid_data || uuid_pg_num != last_uuid_pg_num) { + /* No need to reread the page from above */ + if (uuid_pg_num == pg_num && uuid_data) + memcpy(uuid_data, data, PAGE_SIZE); + else { + if (uuid_data_page) { + kunmap(uuid_data_page); + 
__free_page(uuid_data_page); + } + uuid_data_page = read_bdev_page(bdev, uuid_pg_num); + if (!uuid_data_page) + continue; + uuid_data = kmap(uuid_data_page); + } + } + + last_uuid_pg_num = uuid_pg_num; + memcpy(&fs_info->uuid, &uuid_data[uuid_pg_off], 16); + fs_info->dev_t = bdev->bd_dev; + +no_uuid: + PRINT_HEX_DUMP(KERN_EMERG, "fs_info_from_block_dev " + "returning uuid ", DUMP_PREFIX_NONE, 16, 1, + fs_info->uuid, 16, 0); + + if (dat->last_mount_size) { + int pg = dat->last_mount_offset >> 12, sz; + int off = dat->last_mount_offset & 0xfff; + struct page *last_mount = read_bdev_page(bdev, pg); + unsigned char *last_mount_data; + char *ptr; + + if (!last_mount) { + fs_info = ERR_PTR(-ENOMEM); + break; + } + last_mount_data = kmap(last_mount); + sz = dat->last_mount_size; + ptr = kmalloc(sz, GFP_KERNEL); + + if (!ptr) { + printk(KERN_EMERG "fs_info_from_block_dev " + "failed to get memory for last mount " + "timestamp."); + free_fs_info(fs_info); + fs_info = ERR_PTR(-ENOMEM); + } else { + fs_info->last_mount = ptr; + fs_info->last_mount_size = sz; + memcpy(ptr, &last_mount_data[off], sz); + } + + kunmap(last_mount); + __free_page(last_mount); + } + break; + } + + if (data_page) { + kunmap(data_page); + __free_page(data_page); + } + + if (uuid_data_page) { + kunmap(uuid_data_page); + __free_page(uuid_data_page); + } + + return fs_info; +} + +static int __init uuid_debug_setup(char *str) +{ + int value; + + if (sscanf(str, "=%d", &value)) + debug_enabled = value; + + return 1; +} + +__setup("uuid_debug", uuid_debug_setup); diff -Nur linux-4.2/drivers/accessibility/braille/braille_console.c linux-4.2-pck/drivers/accessibility/braille/braille_console.c --- linux-4.2/drivers/accessibility/braille/braille_console.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/accessibility/braille/braille_console.c 2015-09-08 00:48:22.218364355 -0300 @@ -116,7 +116,7 @@ *c++ = csum; *c++ = ETX; - braille_co->write(braille_co, data, c - data); + braille_co->write(braille_co, data, c - data, 0); } /* Follow the VC cursor*/ diff -Nur linux-4.2/drivers/acpi/blacklist.c linux-4.2-pck/drivers/acpi/blacklist.c --- linux-4.2/drivers/acpi/blacklist.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/acpi/blacklist.c 2015-09-08 00:48:22.218364355 -0300 @@ -317,6 +317,37 @@ }, /* + * The following Lenovo models have a broken workaround in the + * acpi_video backlight implementation to meet the Windows 8 + * requirement of 101 backlight levels. Reverting to pre-Win8 + * behavior fixes the problem. + */ + { + .callback = dmi_disable_osi_win8, + .ident = "Lenovo ThinkPad T430", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T430"), + }, + }, + { + .callback = dmi_disable_osi_win8, + .ident = "Lenovo ThinkPad T430s", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T430s"), + }, + }, + { + .callback = dmi_disable_osi_win8, + .ident = "Lenovo ThinkPad X230", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad X230"), + }, + }, + + /* * BIOS invocation of _OSI(Linux) is almost always a BIOS bug. * Linux ignores it, except for the machines enumerated below.
*/ diff -Nur linux-4.2/drivers/ata/acard-ahci.c linux-4.2-pck/drivers/ata/acard-ahci.c --- linux-4.2/drivers/ata/acard-ahci.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/acard-ahci.c 2015-09-08 00:48:22.218364355 -0300 @@ -478,6 +478,9 @@ ata_port_pbar_desc(ap, AHCI_PCI_BAR, 0x100 + ap->port_no * 0x80, "port"); + rc = ahci_setup_port_privdata(ap); + if (rc) + return rc; /* set initial link pm policy */ /* ap->pm_policy = NOT_AVAILABLE; diff -Nur linux-4.2/drivers/ata/ahci.c linux-4.2-pck/drivers/ata/ahci.c --- linux-4.2/drivers/ata/ahci.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/ahci.c 2015-09-08 00:48:22.218364355 -0300 @@ -1597,6 +1597,9 @@ if (ap->flags & ATA_FLAG_EM) ap->em_message_type = hpriv->em_msg_type; + rc = ahci_setup_port_privdata(ap); + if (rc) + return rc; /* disabled/not-implemented port */ if (!(hpriv->port_map & (1 << i))) diff -Nur linux-4.2/drivers/ata/ahci.h linux-4.2-pck/drivers/ata/ahci.h --- linux-4.2/drivers/ata/ahci.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/ahci.h 2015-09-08 00:48:22.218364355 -0300 @@ -315,6 +315,12 @@ /* enclosure management info per PM slot */ struct ahci_em_priv em_priv[EM_MAX_SLOTS]; char *irq_desc; /* desc in /proc/interrupts */ + bool init_alpe; /* alpe enabled by default */ + bool init_asp; /* asp enabled by default */ + bool init_devslp; /* devslp enabled by default */ + u32 init_dito; /* initial dito configuration */ + u32 init_deto; /* initial deto configuration */ + u32 init_mdat; /* initial mdat configuration */ }; struct ahci_host_priv { @@ -374,6 +380,7 @@ extern struct ata_port_operations ahci_pmp_retry_srst_ops; unsigned int ahci_dev_classify(struct ata_port *ap); +int ahci_setup_port_privdata(struct ata_port *ap); void ahci_fill_cmd_slot(struct ahci_port_priv *pp, unsigned int tag, u32 opts); void ahci_save_initial_config(struct device *dev, diff -Nur linux-4.2/drivers/ata/libahci.c linux-4.2-pck/drivers/ata/libahci.c --- linux-4.2/drivers/ata/libahci.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libahci.c 2015-09-08 00:48:22.218364355 -0300 @@ -684,6 +684,7 @@ { struct ata_port *ap = link->ap; struct ahci_host_priv *hpriv = ap->host->private_data; + struct ahci_port_priv *ppriv = ap->private_data; struct ahci_port_priv *pp = ap->private_data; void __iomem *port_mmio = ahci_port_base(ap); @@ -701,9 +702,9 @@ if (hpriv->cap & HOST_CAP_ALPM) { u32 cmd = readl(port_mmio + PORT_CMD); + cmd &= ~(PORT_CMD_ASP | PORT_CMD_ALPE); if (policy == ATA_LPM_MAX_POWER || !(hints & ATA_LPM_HIPM)) { - cmd &= ~(PORT_CMD_ASP | PORT_CMD_ALPE); cmd |= PORT_CMD_ICC_ACTIVE; writel(cmd, port_mmio + PORT_CMD); @@ -711,6 +712,13 @@ /* wait 10ms to be sure we've come out of LPM state */ ata_msleep(ap, 10); + } else if (policy == ATA_LPM_FIRMWARE_DEFAULTS) { + if (ppriv->init_alpe) + cmd |= PORT_CMD_ALPE; + if (ppriv->init_asp) + cmd |= PORT_CMD_ASP; + + writel(cmd, port_mmio + PORT_CMD); } else { cmd |= PORT_CMD_ALPE; if (policy == ATA_LPM_MIN_POWER) @@ -725,10 +733,17 @@ if ((hpriv->cap2 & HOST_CAP2_SDS) && (hpriv->cap2 & HOST_CAP2_SADM) && (link->device->flags & ATA_DFLAG_DEVSLP)) { - if (policy == ATA_LPM_MIN_POWER) + switch (policy) { + case ATA_LPM_MIN_POWER: ahci_set_aggressive_devslp(ap, true); - else + break; + case ATA_LPM_FIRMWARE_DEFAULTS: + ahci_set_aggressive_devslp(ap, ppriv->init_devslp); + break; + default: ahci_set_aggressive_devslp(ap, false); + break; + } } if (policy == ATA_LPM_MAX_POWER) { @@ -2040,6 +2055,7 @@ static void 
ahci_set_aggressive_devslp(struct ata_port *ap, bool sleep) { struct ahci_host_priv *hpriv = ap->host->private_data; + struct ahci_port_priv *ppriv = ap->private_data; void __iomem *port_mmio = ahci_port_base(ap); struct ata_device *dev = ap->link.device; u32 devslp, dm, dito, mdat, deto; @@ -2075,26 +2091,32 @@ if (rc) return; - dm = (devslp & PORT_DEVSLP_DM_MASK) >> PORT_DEVSLP_DM_OFFSET; - dito = devslp_idle_timeout / (dm + 1); - if (dito > 0x3ff) - dito = 0x3ff; - - /* Use the nominal value 10 ms if the read MDAT is zero, - * the nominal value of DETO is 20 ms. - */ - if (dev->devslp_timing[ATA_LOG_DEVSLP_VALID] & - ATA_LOG_DEVSLP_VALID_MASK) { - mdat = dev->devslp_timing[ATA_LOG_DEVSLP_MDAT] & - ATA_LOG_DEVSLP_MDAT_MASK; - if (!mdat) + if (ppriv->init_devslp) { + dito = ppriv->init_dito; + deto = ppriv->init_deto; + mdat = ppriv->init_mdat; + } else { + dm = (devslp & PORT_DEVSLP_DM_MASK) >> PORT_DEVSLP_DM_OFFSET; + dito = devslp_idle_timeout / (dm + 1); + if (dito > 0x3ff) + dito = 0x3ff; + + /* Use the nominal value 10 ms if the read MDAT is zero, + * the nominal value of DETO is 20 ms. + */ + if (dev->devslp_timing[ATA_LOG_DEVSLP_VALID] & + ATA_LOG_DEVSLP_VALID_MASK) { + mdat = dev->devslp_timing[ATA_LOG_DEVSLP_MDAT] & + ATA_LOG_DEVSLP_MDAT_MASK; + if (!mdat) + mdat = 10; + deto = dev->devslp_timing[ATA_LOG_DEVSLP_DETO]; + if (!deto) + deto = 20; + } else { mdat = 10; - deto = dev->devslp_timing[ATA_LOG_DEVSLP_DETO]; - if (!deto) deto = 20; - } else { - mdat = 10; - deto = 20; + } } devslp |= ((dito << PORT_DEVSLP_DITO_OFFSET) | @@ -2257,19 +2279,53 @@ } #endif +/* + * Allocate port privdata and read back initial power management configuration + */ +int ahci_setup_port_privdata(struct ata_port *ap) +{ + struct ahci_port_priv *pp; + u32 cmd, devslp; + void __iomem *port_mmio = ahci_port_base(ap); + + pp = kzalloc(sizeof(*pp), GFP_KERNEL); + if (!pp) + return -ENOMEM; + + ap->private_data = pp; + + cmd = readl(port_mmio + PORT_CMD); + + if (cmd & PORT_CMD_ALPE) + pp->init_alpe = true; + + if (cmd & PORT_CMD_ASP) + pp->init_asp = true; + + devslp = readl(port_mmio + PORT_DEVSLP); + + /* devslp unsupported or disabled */ + if (!(devslp & PORT_DEVSLP_DSP) || !(devslp & PORT_DEVSLP_ADSE)) + return 0; + + pp->init_devslp = true; + pp->init_dito = (devslp >> PORT_DEVSLP_DITO_OFFSET) & 0x3ff; + pp->init_deto = (devslp >> PORT_DEVSLP_DETO_OFFSET) & 0xff; + pp->init_mdat = (devslp >> PORT_DEVSLP_MDAT_OFFSET) & 0x1f; + + return 0; +} +EXPORT_SYMBOL_GPL(ahci_setup_port_privdata); + static int ahci_port_start(struct ata_port *ap) { struct ahci_host_priv *hpriv = ap->host->private_data; + struct ahci_port_priv *pp = ap->private_data; struct device *dev = ap->host->dev; - struct ahci_port_priv *pp; void *mem; dma_addr_t mem_dma; size_t dma_sz, rx_fis_sz; - pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL); - if (!pp) - return -ENOMEM; - if (ap->host->n_ports > 1) { pp->irq_desc = devm_kzalloc(dev, 8, GFP_KERNEL); if (!pp->irq_desc) { @@ -2348,8 +2404,6 @@ ap->lock = &pp->lock; } - ap->private_data = pp; - /* engage engines, captain */ return ahci_port_resume(ap); } diff -Nur linux-4.2/drivers/ata/libahci_platform.c linux-4.2-pck/drivers/ata/libahci_platform.c --- linux-4.2/drivers/ata/libahci_platform.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libahci_platform.c 2015-09-08 00:48:22.218364355 -0300 @@ -565,6 +565,10 @@ if (ap->flags & ATA_FLAG_EM) ap->em_message_type = hpriv->em_msg_type; + rc = ahci_setup_port_privdata(ap); + if (rc) + return rc; + /* 
disabled/not-implemented port */ if (!(hpriv->port_map & (1 << i))) ap->ops = &ata_dummy_port_ops; diff -Nur linux-4.2/drivers/ata/libata-core.c linux-4.2-pck/drivers/ata/libata-core.c --- linux-4.2/drivers/ata/libata-core.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libata-core.c 2015-09-08 00:48:22.221697530 -0300 @@ -2024,6 +2024,9 @@ } } + if (id[79] & (1 << SATA_DIPM)) + dev->init_dipm = true; + *p_class = class; return 0; @@ -3655,6 +3658,11 @@ return rc; switch (policy) { + case ATA_LPM_FIRMWARE_DEFAULTS: + /* use the values we read at probe */ + scontrol &= ~(0x7 << 8); + scontrol |= (link->init_lpm << 8); + break; case ATA_LPM_MAX_POWER: /* disable all LPM transitions */ scontrol |= (0x7 << 8); @@ -5582,11 +5590,11 @@ } /** - * sata_link_init_spd - Initialize link->sata_spd_limit - * @link: Link to configure sata_spd_limit for + * sata_link_init_config - Initialize link->sata_spd_limit and init_lpm + * @link: Link to configure sata_spd_limit and init_lpm for * - * Initialize @link->[hw_]sata_spd_limit to the currently - * configured value. + * Initialize @link->[hw_]sata_spd_limit and @link->init_lpm to the + * currently configured value. * * LOCKING: * Kernel thread context (may sleep). @@ -5594,7 +5602,7 @@ * RETURNS: * 0 on success, -errno on failure. */ -int sata_link_init_spd(struct ata_link *link) +int sata_link_init_config(struct ata_link *link) { u8 spd; int rc; @@ -5611,6 +5619,8 @@ link->sata_spd_limit = link->hw_sata_spd_limit; + link->init_lpm = (link->saved_scontrol >> 8) & 0x7; + return 0; } @@ -6160,9 +6170,9 @@ ap->cbl = ATA_CBL_SATA; /* init sata_spd_limit to the current value */ - sata_link_init_spd(&ap->link); + sata_link_init_config(&ap->link); if (ap->slave_link) - sata_link_init_spd(ap->slave_link); + sata_link_init_config(ap->slave_link); /* print per-port info to dmesg */ xfer_mask = ata_pack_xfermask(ap->pio_mask, ap->mwdma_mask, diff -Nur linux-4.2/drivers/ata/libata-eh.c linux-4.2-pck/drivers/ata/libata-eh.c --- linux-4.2/drivers/ata/libata-eh.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libata-eh.c 2015-09-08 00:48:22.221697530 -0300 @@ -3429,9 +3429,9 @@ return 0; /* - * DIPM is enabled only for MIN_POWER as some devices - * misbehave when the host NACKs transition to SLUMBER. Order - * device and link configurations such that the host always + * DIPM is enabled only for MIN_POWER and FIRMWARE_DEFAULT as some + * devices misbehave when the host NACKs transition to SLUMBER. + * Order device and link configurations such that the host always * allows DIPM requests. 
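From userspace the new policy is selected through the standard libata sysfs attribute once "firmware_defaults" is added to ata_lpm_policy_names[] in the libata-scsi.c hunk below. A minimal illustration; host0 and the exact sysfs path are assumptions about the target system:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/class/scsi_host/host0/link_power_management_policy", "w");

        if (!f)
                return 1;
        fputs("firmware_defaults\n", f);        /* keep whatever the firmware programmed */
        return fclose(f) ? 1 : 0;
}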
*/ ata_for_each_dev(dev, link, ENABLED) { @@ -3491,10 +3491,13 @@ if (ap && ap->slave_link) ap->slave_link->lpm_policy = policy; - /* host config updated, enable DIPM if transitioning to MIN_POWER */ + /* host config updated, enable DIPM if transitioning to MIN_POWER or + * FIRMWARE_DEFAULT when enabled by firmware + */ ata_for_each_dev(dev, link, ENABLED) { - if (policy == ATA_LPM_MIN_POWER && !no_dipm && - ata_id_has_dipm(dev->id)) { + if ((policy == ATA_LPM_MIN_POWER && !no_dipm && + ata_id_has_dipm(dev->id)) || + (policy == ATA_LPM_FIRMWARE_DEFAULTS && dev->init_dipm)) { err_mask = ata_dev_set_feature(dev, SETFEATURES_SATA_ENABLE, SATA_DIPM); if (err_mask && err_mask != AC_ERR_DEV) { diff -Nur linux-4.2/drivers/ata/libata-pmp.c linux-4.2-pck/drivers/ata/libata-pmp.c --- linux-4.2/drivers/ata/libata-pmp.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libata-pmp.c 2015-09-08 00:48:22.221697530 -0300 @@ -538,7 +538,7 @@ ap->ops->pmp_attach(ap); ata_for_each_link(tlink, ap, EDGE) - sata_link_init_spd(tlink); + sata_link_init_config(tlink); return 0; diff -Nur linux-4.2/drivers/ata/libata-scsi.c linux-4.2-pck/drivers/ata/libata-scsi.c --- linux-4.2/drivers/ata/libata-scsi.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libata-scsi.c 2015-09-08 00:48:22.221697530 -0300 @@ -107,6 +107,7 @@ static const char *ata_lpm_policy_names[] = { [ATA_LPM_UNKNOWN] = "max_performance", [ATA_LPM_MAX_POWER] = "max_performance", + [ATA_LPM_FIRMWARE_DEFAULTS] = "firmware_defaults", [ATA_LPM_MED_POWER] = "medium_power", [ATA_LPM_MIN_POWER] = "min_power", }; diff -Nur linux-4.2/drivers/ata/libata.h linux-4.2-pck/drivers/ata/libata.h --- linux-4.2/drivers/ata/libata.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/libata.h 2015-09-08 00:48:22.221697530 -0300 @@ -98,7 +98,7 @@ extern bool ata_phys_link_offline(struct ata_link *link); extern void ata_dev_init(struct ata_device *dev); extern void ata_link_init(struct ata_port *ap, struct ata_link *link, int pmp); -extern int sata_link_init_spd(struct ata_link *link); +extern int sata_link_init_config(struct ata_link *link); extern int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg); extern int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg); extern struct ata_port *ata_port_alloc(struct ata_host *host); diff -Nur linux-4.2/drivers/ata/sata_highbank.c linux-4.2-pck/drivers/ata/sata_highbank.c --- linux-4.2/drivers/ata/sata_highbank.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/ata/sata_highbank.c 2015-09-08 00:48:22.221697530 -0300 @@ -557,6 +557,10 @@ if (ap->flags & ATA_FLAG_EM) ap->em_message_type = hpriv->em_msg_type; + rc = ahci_setup_port_privdata(ap); + if (rc) + goto err0; + /* disabled/not-implemented port */ if (!(hpriv->port_map & (1 << i))) ap->ops = &ata_dummy_port_ops; diff -Nur linux-4.2/drivers/cpufreq/cpufreq_ondemand.c linux-4.2-pck/drivers/cpufreq/cpufreq_ondemand.c --- linux-4.2/drivers/cpufreq/cpufreq_ondemand.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/cpufreq/cpufreq_ondemand.c 2015-09-08 00:48:22.221697530 -0300 @@ -20,7 +20,11 @@ /* On-demand governor macros */ #define DEF_FREQUENCY_UP_THRESHOLD (80) +#ifdef CONFIG_PCK_INTERACTIVE +#define DEF_SAMPLING_DOWN_FACTOR (10) +#else #define DEF_SAMPLING_DOWN_FACTOR (1) +#endif #define MAX_SAMPLING_DOWN_FACTOR (100000) #define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) diff -Nur linux-4.2/drivers/gpu/drm/drm_gem.c 
linux-4.2-pck/drivers/gpu/drm/drm_gem.c --- linux-4.2/drivers/gpu/drm/drm_gem.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/gpu/drm/drm_gem.c 2015-09-08 11:20:32.040366141 -0300 @@ -137,7 +137,7 @@ drm_gem_private_object_init(dev, obj, size); - filp = shmem_file_setup("drm mm object", size, VM_NORESERVE); + filp = shmem_file_setup("drm mm object", size, VM_NORESERVE, 1); if (IS_ERR(filp)) return PTR_ERR(filp); diff -Nur linux-4.2/drivers/gpu/drm/radeon/r600.c linux-4.2-pck/drivers/gpu/drm/radeon/r600.c --- linux-4.2/drivers/gpu/drm/radeon/r600.c 2015-08-30 16:21:13.000000000 -0300 +++ linux-4.2-pck/drivers/gpu/drm/radeon/r600.c 2015-09-08 00:49:30.085144720 -0300 @@ -2489,7 +2489,7 @@ } DRM_INFO("Loading %s Microcode\n", chip_name); - +#if 0 snprintf(fw_name, sizeof(fw_name), "/*(DEBLOBBED)*/", chip_name); err = reject_firmware(&rdev->pfp_fw, fw_name, rdev->dev); if (err) @@ -2541,7 +2541,7 @@ err = -EINVAL; } } - +#endif out: if (err) { if (err != -EINVAL) @@ -3201,7 +3201,7 @@ r = radeon_bo_init(rdev); if (r) return r; - +#if 0 if (!rdev->me_fw || !rdev->pfp_fw || !rdev->rlc_fw) { r = r600_init_microcode(rdev); if (r) { @@ -3209,7 +3209,7 @@ return r; } } - +#endif /* Initialize power management */ radeon_pm_init(rdev); diff -Nur linux-4.2/drivers/gpu/drm/radeon/r600_cp.c linux-4.2-pck/drivers/gpu/drm/radeon/r600_cp.c --- linux-4.2/drivers/gpu/drm/radeon/r600_cp.c 2015-08-30 16:21:17.000000000 -0300 +++ linux-4.2-pck/drivers/gpu/drm/radeon/r600_cp.c 2015-09-08 00:49:30.085144720 -0300 @@ -2241,7 +2241,7 @@ else r600_vm_init(dev); } - +#if 0 if (!dev_priv->me_fw || !dev_priv->pfp_fw) { int err = r600_cp_init_microcode(dev_priv); if (err) { @@ -2250,6 +2250,8 @@ return err; } } +#endif + printk("Skipping firmware loading\n"); if (((dev_priv->flags & RADEON_FAMILY_MASK) >= CHIP_RV770)) r700_cp_load_microcode(dev_priv); else diff -Nur linux-4.2/drivers/gpu/drm/ttm/ttm_tt.c linux-4.2-pck/drivers/gpu/drm/ttm/ttm_tt.c --- linux-4.2/drivers/gpu/drm/ttm/ttm_tt.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/gpu/drm/ttm/ttm_tt.c 2015-09-08 11:20:32.080364185 -0300 @@ -339,7 +339,7 @@ if (!persistent_swap_storage) { swap_storage = shmem_file_setup("ttm swap", ttm->num_pages << PAGE_SHIFT, - 0); + 0, 0); if (unlikely(IS_ERR(swap_storage))) { pr_err("Failed allocating swap storage\n"); return PTR_ERR(swap_storage); diff -Nur linux-4.2/drivers/input/joydev.c linux-4.2-pck/drivers/input/joydev.c --- linux-4.2/drivers/input/joydev.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/input/joydev.c 2015-09-08 00:48:22.221697530 -0300 @@ -28,15 +28,21 @@ #include #include #include +#include +#include MODULE_AUTHOR("Vojtech Pavlik "); MODULE_DESCRIPTION("Joystick device interfaces"); MODULE_SUPPORTED_DEVICE("input/js"); MODULE_LICENSE("GPL"); - #define JOYDEV_MINOR_BASE 0 #define JOYDEV_MINORS 16 #define JOYDEV_BUFFER_SIZE 64 +#define MAX_REMAP_SIZE 10 + +static int remap_array[MAX_REMAP_SIZE]; +static int remap_count = 0; +static int free_buttons[MAX_REMAP_SIZE]; struct joydev { int open; @@ -71,6 +77,9 @@ struct list_head node; }; +module_param_array(remap_array, int, &remap_count, 0 ); +MODULE_PARM_DESC( remap_array, "remap axis to buttons\n" ); + static int joydev_correct(int value, struct js_corr *corr) { switch (corr->type) { @@ -121,6 +130,17 @@ struct joydev *joydev = handle->private; struct joydev_client *client; struct js_event event; + int i; + + if( remap_count > 0 && remap_count < MAX_REMAP_SIZE ){ + for( i = 0; i < remap_count; i++ ) + if(
code == remap_array[i] ){ + type = EV_KEY; + code = free_buttons[i]; + if( value == 255 ) + value = 1; + } + } switch (type) { @@ -826,7 +846,7 @@ const struct input_device_id *id) { struct joydev *joydev; - int i, j, t, minor, dev_no; + int i, j = 0, t, minor, dev_no; int error; minor = input_get_new_minor(JOYDEV_MINOR_BASE, JOYDEV_MINORS, true); @@ -871,15 +891,24 @@ joydev->keymap[i] = joydev->nkey; joydev->keypam[joydev->nkey] = i + BTN_MISC; joydev->nkey++; - } + j = i; + } + if( remap_count > 0 && remap_count < MAX_REMAP_SIZE ){ + printk( "[joydev] axis remapping enabled\n" ); + for( i = 0; i < remap_count; i++ ){ + joydev->keymap[j + i + 1] = joydev->nkey; + joydev->keypam[joydev->nkey] = i + j + 1 + BTN_MISC; + free_buttons[i] = j + i + 1 + BTN_MISC; + joydev->nkey++; + } for (i = 0; i < BTN_JOYSTICK - BTN_MISC; i++) if (test_bit(i + BTN_MISC, dev->keybit)) { joydev->keymap[i] = joydev->nkey; joydev->keypam[joydev->nkey] = i + BTN_MISC; joydev->nkey++; } - + } for (i = 0; i < joydev->nabs; i++) { j = joydev->abspam[i]; if (input_abs_get_max(dev, j) == input_abs_get_min(dev, j)) { diff -Nur linux-4.2/drivers/input/mouse/synaptics.c linux-4.2-pck/drivers/input/mouse/synaptics.c --- linux-4.2/drivers/input/mouse/synaptics.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/input/mouse/synaptics.c 2015-09-08 00:48:22.221697530 -0300 @@ -1250,7 +1250,9 @@ /* Clickpads report only left button */ __clear_bit(BTN_RIGHT, dev->keybit); __clear_bit(BTN_MIDDLE, dev->keybit); - } + } else if (SYN_CAP_CLICKPAD2BTN(priv->ext_cap_0c) || + SYN_CAP_CLICKPAD2BTN2(priv->ext_cap_0c)) + __set_bit(INPUT_PROP_BUTTONPAD, dev->propbit); } static ssize_t synaptics_show_disable_gesture(struct psmouse *psmouse, diff -Nur linux-4.2/drivers/input/mouse/synaptics.h linux-4.2-pck/drivers/input/mouse/synaptics.h --- linux-4.2/drivers/input/mouse/synaptics.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/input/mouse/synaptics.h 2015-09-08 00:48:22.221697530 -0300 @@ -85,6 +85,7 @@ */ #define SYN_CAP_CLICKPAD(ex0c) ((ex0c) & 0x100000) /* 1-button ClickPad */ #define SYN_CAP_CLICKPAD2BTN(ex0c) ((ex0c) & 0x000100) /* 2-button ClickPad */ +#define SYN_CAP_CLICKPAD2BTN2(ex0c) ((ex0c) & 0x200000) /* 2-button ClickPad */ #define SYN_CAP_MAX_DIMENSIONS(ex0c) ((ex0c) & 0x020000) #define SYN_CAP_MIN_DIMENSIONS(ex0c) ((ex0c) & 0x002000) #define SYN_CAP_ADV_GESTURE(ex0c) ((ex0c) & 0x080000) diff -Nur linux-4.2/drivers/macintosh/Kconfig linux-4.2-pck/drivers/macintosh/Kconfig --- linux-4.2/drivers/macintosh/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/macintosh/Kconfig 2015-09-08 00:48:22.221697530 -0300 @@ -172,6 +172,13 @@ If unsure, say Y. +config ADB_TRACKPAD_ABSOLUTE + bool "Enable absolute mode for adb trackpads" + depends on INPUT_ADBHID + help + Enable absolute mode in adb-base trackpads. This feature adds + compatibility with synaptics Xorg / Xfree drivers. 
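The joydev hunks above add a remap_array module parameter (standard array syntax, e.g. a hypothetical joydev.remap_array=2,6 to turn absolute-axis codes 2 and 6 into extra buttons). A condensed restatement of the rule joydev_event() applies, under a hypothetical name (remap_to_button) and purely as a reading aid:

static int remap_to_button(int *type, int *code, int *value)
{
        int i;

        if (remap_count <= 0 || remap_count >= MAX_REMAP_SIZE)
                return 0;
        for (i = 0; i < remap_count; i++) {
                if (*code != remap_array[i])
                        continue;
                *type = EV_KEY;                 /* forward as a button ... */
                *code = free_buttons[i];        /* ... on a slot reserved in joydev_connect() */
                if (*value == 255)
                        *value = 1;             /* full deflection reads as "pressed" */
                return 1;
        }
        return 0;
}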
+ config MAC_EMUMOUSEBTN tristate "Support for mouse button 2+3 emulation" depends on SYSCTL && INPUT diff -Nur linux-4.2/drivers/macintosh/adbhid.c linux-4.2-pck/drivers/macintosh/adbhid.c --- linux-4.2/drivers/macintosh/adbhid.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/macintosh/adbhid.c 2015-09-08 00:48:22.221697530 -0300 @@ -261,6 +261,15 @@ #define ADBMOUSE_MS_A3 8 /* Mouse systems A3 trackball (handler 3) */ #define ADBMOUSE_MACALLY2 9 /* MacAlly 2-button mouse */ +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE +#define ABS_XMIN 310 +#define ABS_XMAX 1700 +#define ABS_YMIN 200 +#define ABS_YMAX 1000 +#define ABS_ZMIN 0 +#define ABS_ZMAX 55 +#endif + static void adbhid_keyboard_input(unsigned char *data, int nb, int apoll) { @@ -405,6 +414,9 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) { int id = (data[0] >> 4) & 0x0f; +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + int btn = 0; int x_axis = 0; int y_axis = 0; int z_axis = 0; +#endif if (!adbhid[id]) { printk(KERN_ERR "ADB HID on ID %d not yet registered\n", id); @@ -436,6 +448,17 @@ high bits of y-axis motion. XY is additional high bits of x-axis motion. + For ADB Absolute motion protocol the data array will contain the + following values: + + BITS COMMENTS + data[0] = dddd 1100 ADB command: Talk, register 0, for device dddd. + data[1] = byyy yyyy Left button and y-axis motion. + data[2] = bxxx xxxx Second button and x-axis motion. + data[3] = 1yyy 1xxx Half bits of y-axis and x-axis motion. + data[4] = 1yyy 1xxx Higher bits of y-axis and x-axis motion. + data[5] = 1zzz 1zzz Higher and lower bits of z-pressure. + MacAlly 2-button mouse protocol. For MacAlly 2-button mouse protocol the data array will contain the @@ -458,8 +481,17 @@ switch (adbhid[id]->mouse_kind) { case ADBMOUSE_TRACKPAD: +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + x_axis = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) | + ((data[4] & 0x07) << 10); + y_axis = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) | + ((data[4] & 0x70) << 6); + z_axis = (data[5] & 0x07) | ((data[5] & 0x70) >> 1); + btn = (!(data[1] >> 7)) & 1; +#else data[1] = (data[1] & 0x7f) | ((data[1] & data[2]) & 0x80); data[2] = data[2] | 0x80; +#endif break; case ADBMOUSE_MICROSPEED: data[1] = (data[1] & 0x7f) | ((data[3] & 0x01) << 7); @@ -485,17 +517,39 @@ break; } - input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); - input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + if ( adbhid[id]->mouse_kind == ADBMOUSE_TRACKPAD ) { - if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) - input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); + if(z_axis > 30) input_report_key(adbhid[id]->input, BTN_TOUCH, 1); + if(z_axis < 25) input_report_key(adbhid[id]->input, BTN_TOUCH, 0); - input_report_rel(adbhid[id]->input, REL_X, - ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); - input_report_rel(adbhid[id]->input, REL_Y, - ((data[1]&0x7f) < 64 ? 
(data[1]&0x7f) : (data[1]&0x7f)-128 )); + if(z_axis > 0){ + input_report_abs(adbhid[id]->input, ABS_X, x_axis); + input_report_abs(adbhid[id]->input, ABS_Y, y_axis); + input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 1); + input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 5); + } else { + input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 0); + input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 0); + } + + input_report_abs(adbhid[id]->input, ABS_PRESSURE, z_axis); + input_report_key(adbhid[id]->input, BTN_LEFT, btn); + } else { +#endif + input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); + input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); + + if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) + input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); + input_report_rel(adbhid[id]->input, REL_X, + ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); + input_report_rel(adbhid[id]->input, REL_Y, + ((data[1]&0x7f) < 64 ? (data[1]&0x7f) : (data[1]&0x7f)-128 )); +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + } +#endif input_sync(adbhid[id]->input); } @@ -849,6 +903,15 @@ input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + set_bit(EV_ABS, input_dev->evbit); + input_set_abs_params(input_dev, ABS_X, ABS_XMIN, ABS_XMAX, 0, 0); + input_set_abs_params(input_dev, ABS_Y, ABS_YMIN, ABS_YMAX, 0, 0); + input_set_abs_params(input_dev, ABS_PRESSURE, ABS_ZMIN, ABS_ZMAX, 0, 0); + set_bit(BTN_TOUCH, input_dev->keybit); + set_bit(BTN_TOOL_FINGER, input_dev->keybit); + set_bit(ABS_TOOL_WIDTH, input_dev->absbit); +#endif break; case ADB_MISC: @@ -1132,7 +1195,11 @@ r1_buffer[3], r1_buffer[4], r1_buffer[5], +#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE + 0x00, /* Enable absolute mode */ +#else 0x03, /*r1_buffer[6],*/ +#endif r1_buffer[7]); /* Without this flush, the trackpad may be locked up */ diff -Nur linux-4.2/drivers/net/ethernet/intel/e1000e/netdev.c linux-4.2-pck/drivers/net/ethernet/intel/e1000e/netdev.c --- linux-4.2/drivers/net/ethernet/intel/e1000e/netdev.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/net/ethernet/intel/e1000e/netdev.c 2015-09-08 00:48:22.225030705 -0300 @@ -4280,18 +4280,29 @@ struct e1000_adapter *adapter = container_of(cc, struct e1000_adapter, cc); struct e1000_hw *hw = &adapter->hw; + u32 systimel_1, systimel_2, systimeh; cycle_t systim, systim_next; - /* SYSTIMH latching upon SYSTIML read does not work well. To fix that - * we don't want to allow overflow of SYSTIML and a change to SYSTIMH - * to occur between reads, so if we read a vale close to overflow, we - * wait for overflow to occur and read both registers when its safe. + /* SYSTIMH latching upon SYSTIML read does not work well. + * This means that if SYSTIML overflows after we read it but before + * we read SYSTIMH, the value of SYSTIMH has been incremented and we + * will experience a huge non linear increment in the systime value + * to fix that we test for overflow and if true, we re-read systime. */ - u32 systim_overflow_latch_fix = 0x3FFFFFFF; - - do { - systim = (cycle_t)er32(SYSTIML); - } while (systim > systim_overflow_latch_fix); - systim |= (cycle_t)er32(SYSTIMH) << 32; + systimel_1 = er32(SYSTIML); + systimeh = er32(SYSTIMH); + systimel_2 = er32(SYSTIML); + /* Check for overflow. 
If there was no overflow, use the values */ + if (systimel_1 < systimel_2) { + systim = (cycle_t)systimel_1; + systim |= (cycle_t)systimeh << 32; + } else { + /* There was an overflow, read again SYSTIMH, and use + * systimel_2 + */ + systimeh = er32(SYSTIMH); + systim = (cycle_t)systimel_2; + systim |= (cycle_t)systimeh << 32; + } if ((hw->mac.type == e1000_82574) || (hw->mac.type == e1000_82583)) { u64 incvalue, time_delta, rem, temp; diff -Nur linux-4.2/drivers/net/netconsole.c linux-4.2-pck/drivers/net/netconsole.c --- linux-4.2/drivers/net/netconsole.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/net/netconsole.c 2015-09-08 00:48:22.225030705 -0300 @@ -836,7 +836,7 @@ } static void write_ext_msg(struct console *con, const char *msg, - unsigned int len) + unsigned int len, unsigned int loglevel) { struct netconsole_target *nt; unsigned long flags; @@ -851,7 +851,8 @@ spin_unlock_irqrestore(&target_list_lock, flags); } -static void write_msg(struct console *con, const char *msg, unsigned int len) +static void write_msg(struct console *con, const char *msg, unsigned int len, + unsigned int loglevel) { int frag, left; unsigned long flags; diff -Nur linux-4.2/drivers/platform/x86/Kconfig linux-4.2-pck/drivers/platform/x86/Kconfig --- linux-4.2/drivers/platform/x86/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/platform/x86/Kconfig 2015-09-08 00:48:22.225030705 -0300 @@ -489,9 +489,30 @@ If you are not sure, say Y here. The driver enables polling only if it is strictly necessary to do so. +config THINKPAD_EC + tristate + depends on X86 + ---help--- + This is a low-level driver for accessing the ThinkPad H8S embedded + controller over the LPC bus (not to be confused with the ACPI Embedded + Controller interface). + +config TP_SMAPI + tristate "ThinkPad SMAPI Support" + depends on X86 + select THINKPAD_EC + default n + help + This adds SMAPI support on Lenovo/IBM ThinkPads, for features such + as battery charging control. For more information about this driver + see . + + If you have a Lenovo/IBM ThinkPad laptop, say Y or M here. 
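The e1000e hunk earlier in this section replaces the SYSTIML busy-wait workaround with a read-low/read-high/read-low sequence. The same pattern, restated as a hedged standalone helper; read_lo/read_hi are stand-ins for the er32(SYSTIML)/er32(SYSTIMH) accesses:

static u64 read_split_counter(u32 (*read_lo)(void), u32 (*read_hi)(void))
{
        u32 lo1 = read_lo();
        u32 hi  = read_hi();
        u32 lo2 = read_lo();

        if (lo1 < lo2)                          /* low half did not wrap between reads */
                return ((u64)hi << 32) | lo1;
        /* it wrapped: the high half may already belong to the new epoch,
         * so re-read it and pair it with the post-wrap low value */
        return ((u64)read_hi() << 32) | lo2;
}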
+ config SENSORS_HDAPS tristate "Thinkpad Hard Drive Active Protection System (hdaps)" depends on INPUT && X86 + select THINKPAD_EC select INPUT_POLLDEV default n help diff -Nur linux-4.2/drivers/platform/x86/Makefile linux-4.2-pck/drivers/platform/x86/Makefile --- linux-4.2/drivers/platform/x86/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/platform/x86/Makefile 2015-09-08 00:48:22.225030705 -0300 @@ -25,6 +25,8 @@ obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o obj-$(CONFIG_IDEAPAD_LAPTOP) += ideapad-laptop.o obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o +obj-$(CONFIG_THINKPAD_EC) += thinkpad_ec.o +obj-$(CONFIG_TP_SMAPI) += tp_smapi.o obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o obj-$(CONFIG_FUJITSU_TABLET) += fujitsu-tablet.o diff -Nur linux-4.2/drivers/platform/x86/hdaps.c linux-4.2-pck/drivers/platform/x86/hdaps.c --- linux-4.2/drivers/platform/x86/hdaps.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/platform/x86/hdaps.c 2015-09-08 00:48:22.225030705 -0300 @@ -30,266 +30,384 @@ #include #include -#include +#include #include -#include #include #include #include #include -#include - -#define HDAPS_LOW_PORT 0x1600 /* first port used by hdaps */ -#define HDAPS_NR_PORTS 0x30 /* number of ports: 0x1600 - 0x162f */ - -#define HDAPS_PORT_STATE 0x1611 /* device state */ -#define HDAPS_PORT_YPOS 0x1612 /* y-axis position */ -#define HDAPS_PORT_XPOS 0x1614 /* x-axis position */ -#define HDAPS_PORT_TEMP1 0x1616 /* device temperature, in Celsius */ -#define HDAPS_PORT_YVAR 0x1617 /* y-axis variance (what is this?) */ -#define HDAPS_PORT_XVAR 0x1619 /* x-axis variance (what is this?) */ -#define HDAPS_PORT_TEMP2 0x161b /* device temperature (again?) */ -#define HDAPS_PORT_UNKNOWN 0x161c /* what is this? */ -#define HDAPS_PORT_KMACT 0x161d /* keyboard or mouse activity */ - -#define STATE_FRESH 0x50 /* accelerometer data is fresh */ +#include +#include +#include + +/* Embedded controller accelerometer read command and its result: */ +static const struct thinkpad_ec_row ec_accel_args = + { .mask = 0x0001, .val = {0x11} }; +#define EC_ACCEL_IDX_READOUTS 0x1 /* readouts included in this read */ + /* First readout, if READOUTS>=1: */ +#define EC_ACCEL_IDX_YPOS1 0x2 /* y-axis position word */ +#define EC_ACCEL_IDX_XPOS1 0x4 /* x-axis position word */ +#define EC_ACCEL_IDX_TEMP1 0x6 /* device temperature in Celsius */ + /* Second readout, if READOUTS>=2: */ +#define EC_ACCEL_IDX_XPOS2 0x7 /* y-axis position word */ +#define EC_ACCEL_IDX_YPOS2 0x9 /* x-axis position word */ +#define EC_ACCEL_IDX_TEMP2 0xb /* device temperature in Celsius */ +#define EC_ACCEL_IDX_QUEUED 0xc /* Number of queued readouts left */ +#define EC_ACCEL_IDX_KMACT 0xd /* keyboard or mouse activity */ +#define EC_ACCEL_IDX_RETVAL 0xf /* command return value, good=0x00 */ #define KEYBD_MASK 0x20 /* set if keyboard activity */ #define MOUSE_MASK 0x40 /* set if mouse activity */ -#define KEYBD_ISSET(n) (!! (n & KEYBD_MASK)) /* keyboard used? */ -#define MOUSE_ISSET(n) (!! (n & MOUSE_MASK)) /* mouse used? */ -#define INIT_TIMEOUT_MSECS 4000 /* wait up to 4s for device init ... */ -#define INIT_WAIT_MSECS 200 /* ... 
in 200ms increments */ +#define READ_TIMEOUT_MSECS 100 /* wait this long for device read */ +#define RETRY_MSECS 3 /* retry delay */ -#define HDAPS_POLL_INTERVAL 50 /* poll for input every 1/20s (50 ms)*/ #define HDAPS_INPUT_FUZZ 4 /* input event threshold */ #define HDAPS_INPUT_FLAT 4 +#define KMACT_REMEMBER_PERIOD (HZ/10) /* keyboard/mouse persistance */ -#define HDAPS_X_AXIS (1 << 0) -#define HDAPS_Y_AXIS (1 << 1) -#define HDAPS_BOTH_AXES (HDAPS_X_AXIS | HDAPS_Y_AXIS) +/* Input IDs */ +#define HDAPS_INPUT_VENDOR PCI_VENDOR_ID_IBM +#define HDAPS_INPUT_PRODUCT 0x5054 /* "TP", shared with thinkpad_acpi */ +#define HDAPS_INPUT_JS_VERSION 0x6801 /* Joystick emulation input device */ +#define HDAPS_INPUT_RAW_VERSION 0x4801 /* Raw accelerometer input device */ + +/* Axis orientation. */ +/* The unnatural bit-representation of inversions is for backward + * compatibility with the"invert=1" module parameter. */ +#define HDAPS_ORIENT_INVERT_XY 0x01 /* Invert both X and Y axes. */ +#define HDAPS_ORIENT_INVERT_X 0x02 /* Invert the X axis (uninvert if + * already inverted by INVERT_XY). */ +#define HDAPS_ORIENT_SWAP 0x04 /* Swap the axes. The swap occurs + * before inverting X or Y. */ +#define HDAPS_ORIENT_MAX 0x07 +#define HDAPS_ORIENT_UNDEFINED 0xFF /* Placeholder during initialization */ +#define HDAPS_ORIENT_INVERT_Y (HDAPS_ORIENT_INVERT_XY | HDAPS_ORIENT_INVERT_X) +static struct timer_list hdaps_timer; static struct platform_device *pdev; -static struct input_polled_dev *hdaps_idev; -static unsigned int hdaps_invert; -static u8 km_activity; -static int rest_x; -static int rest_y; - -static DEFINE_MUTEX(hdaps_mtx); - -/* - * __get_latch - Get the value from a given port. Callers must hold hdaps_mtx. - */ -static inline u8 __get_latch(u16 port) -{ - return inb(port) & 0xff; +static struct input_dev *hdaps_idev; /* joystick-like device with fuzz */ +static struct input_dev *hdaps_idev_raw; /* raw hdaps sensor readouts */ +static unsigned int hdaps_invert = HDAPS_ORIENT_UNDEFINED; +static int needs_calibration; + +/* Configuration: */ +static int sampling_rate = 50; /* Sampling rate */ +static int oversampling_ratio = 5; /* Ratio between our sampling rate and + * EC accelerometer sampling rate */ +static int running_avg_filter_order = 2; /* EC running average filter order */ + +/* Latest state readout: */ +static int pos_x, pos_y; /* position */ +static int temperature; /* temperature */ +static int stale_readout = 1; /* last read invalid */ +static int rest_x, rest_y; /* calibrated rest position */ + +/* Last time we saw keyboard and mouse activity: */ +static u64 last_keyboard_jiffies = INITIAL_JIFFIES; +static u64 last_mouse_jiffies = INITIAL_JIFFIES; +static u64 last_update_jiffies = INITIAL_JIFFIES; + +/* input device use count */ +static int hdaps_users; +static DEFINE_MUTEX(hdaps_users_mtx); + +/* Some models require an axis transformation to the standard representation */ +static void transform_axes(int *x, int *y) +{ + if (hdaps_invert & HDAPS_ORIENT_SWAP) { + int z; + z = *x; + *x = *y; + *y = z; + } + if (hdaps_invert & HDAPS_ORIENT_INVERT_XY) { + *x = -*x; + *y = -*y; + } + if (hdaps_invert & HDAPS_ORIENT_INVERT_X) + *x = -*x; } -/* - * __check_latch - Check a port latch for a given value. Returns zero if the - * port contains the given value. Callers must hold hdaps_mtx. +/** + * __hdaps_update - query current state, with locks already acquired + * @fast: if nonzero, do one quick attempt without retries. + * + * Query current accelerometer state and update global state variables. 
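A worked example for transform_axes() above: with hdaps_invert = 0x05, i.e. HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY, the axes are swapped first and then both negated:

int x = 100, y = -20;

transform_axes(&x, &y);   /* swap -> (-20, 100), invert both -> x == 20, y == -100 */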
+ * Also prefetches the next query. Caller must hold controller lock. */ -static inline int __check_latch(u16 port, u8 val) +static int __hdaps_update(int fast) { - if (__get_latch(port) == val) - return 0; - return -EINVAL; -} + /* Read data: */ + struct thinkpad_ec_row data; + int ret; -/* - * __wait_latch - Wait up to 100us for a port latch to get a certain value, - * returning zero if the value is obtained. Callers must hold hdaps_mtx. - */ -static int __wait_latch(u16 port, u8 val) -{ - unsigned int i; + data.mask = (1 << EC_ACCEL_IDX_READOUTS) | (1 << EC_ACCEL_IDX_KMACT) | + (3 << EC_ACCEL_IDX_YPOS1) | (3 << EC_ACCEL_IDX_XPOS1) | + (1 << EC_ACCEL_IDX_TEMP1) | (1 << EC_ACCEL_IDX_RETVAL); + if (fast) + ret = thinkpad_ec_try_read_row(&ec_accel_args, &data); + else + ret = thinkpad_ec_read_row(&ec_accel_args, &data); + thinkpad_ec_prefetch_row(&ec_accel_args); /* Prefetch even if error */ + if (ret) + return ret; - for (i = 0; i < 20; i++) { - if (!__check_latch(port, val)) - return 0; - udelay(5); + /* Check status: */ + if (data.val[EC_ACCEL_IDX_RETVAL] != 0x00) { + pr_warn("read RETVAL=0x%02x\n", + data.val[EC_ACCEL_IDX_RETVAL]); + return -EIO; } - return -EIO; + if (data.val[EC_ACCEL_IDX_READOUTS] < 1) + return -EBUSY; /* no pending readout, try again later */ + + /* Parse position data: */ + pos_x = *(s16 *)(data.val+EC_ACCEL_IDX_XPOS1); + pos_y = *(s16 *)(data.val+EC_ACCEL_IDX_YPOS1); + transform_axes(&pos_x, &pos_y); + + /* Keyboard and mouse activity status is cleared as soon as it's read, + * so applications will eat each other's events. Thus we remember any + * event for KMACT_REMEMBER_PERIOD jiffies. + */ + if (data.val[EC_ACCEL_IDX_KMACT] & KEYBD_MASK) + last_keyboard_jiffies = get_jiffies_64(); + if (data.val[EC_ACCEL_IDX_KMACT] & MOUSE_MASK) + last_mouse_jiffies = get_jiffies_64(); + + temperature = data.val[EC_ACCEL_IDX_TEMP1]; + + last_update_jiffies = get_jiffies_64(); + stale_readout = 0; + if (needs_calibration) { + rest_x = pos_x; + rest_y = pos_y; + needs_calibration = 0; + } + + return 0; } -/* - * __device_refresh - request a refresh from the accelerometer. Does not wait - * for refresh to complete. Callers must hold hdaps_mtx. +/** + * hdaps_update - acquire locks and query current state + * + * Query current accelerometer state and update global state variables. + * Also prefetches the next query. + * Retries until timeout if the accelerometer is not in ready status (common). + * Does its own locking. */ -static void __device_refresh(void) +static int hdaps_update(void) { - udelay(200); - if (inb(0x1604) != STATE_FRESH) { - outb(0x11, 0x1610); - outb(0x01, 0x161f); + u64 age = get_jiffies_64() - last_update_jiffies; + int total, ret; + + if (!stale_readout && age < (9*HZ)/(10*sampling_rate)) + return 0; /* already updated recently */ + for (total = 0; total < READ_TIMEOUT_MSECS; total += RETRY_MSECS) { + ret = thinkpad_ec_lock(); + if (ret) + return ret; + ret = __hdaps_update(0); + thinkpad_ec_unlock(); + + if (!ret) + return 0; + if (ret != -EBUSY) + break; + msleep(RETRY_MSECS); } + return ret; } -/* - * __device_refresh_sync - request a synchronous refresh from the - * accelerometer. We wait for the refresh to complete. Returns zero if - * successful and nonzero on error. Callers must hold hdaps_mtx. +/** + * hdaps_set_power - enable or disable power to the accelerometer. + * Returns zero on success and negative error code on failure. Can sleep. 
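
The .mask field used by __hdaps_update() above selects which of the 16 row bytes are transferred: bit i covers val[i], and a 16-bit field such as XPOS1 spans two consecutive bytes, hence the (3 << idx) terms. A small sketch of that arithmetic with the index constants defined earlier; the resulting mask value is 0xA07E.

/* Illustrative sketch only: how the row mask built by __hdaps_update()
 * is composed from the EC_ACCEL_IDX_* constants above. */
#include <stdio.h>

#define EC_ACCEL_IDX_READOUTS 0x1
#define EC_ACCEL_IDX_YPOS1    0x2
#define EC_ACCEL_IDX_XPOS1    0x4
#define EC_ACCEL_IDX_TEMP1    0x6
#define EC_ACCEL_IDX_KMACT    0xd
#define EC_ACCEL_IDX_RETVAL   0xf

int main(void)
{
        unsigned int mask = (1 << EC_ACCEL_IDX_READOUTS) |
                            (1 << EC_ACCEL_IDX_KMACT) |
                            (3 << EC_ACCEL_IDX_YPOS1) |
                            (3 << EC_ACCEL_IDX_XPOS1) |
                            (1 << EC_ACCEL_IDX_TEMP1) |
                            (1 << EC_ACCEL_IDX_RETVAL);

        printf("0x%04X\n", mask);       /* prints 0xA07E */
        return 0;
}
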
*/ -static int __device_refresh_sync(void) +static int hdaps_set_power(int on) { - __device_refresh(); - return __wait_latch(0x1604, STATE_FRESH); + struct thinkpad_ec_row args = + { .mask = 0x0003, .val = {0x14, on?0x01:0x00} }; + struct thinkpad_ec_row data = { .mask = 0x8000 }; + int ret = thinkpad_ec_read_row(&args, &data); + if (ret) + return ret; + if (data.val[0xF] != 0x00) + return -EIO; + return 0; } -/* - * __device_complete - indicate to the accelerometer that we are done reading - * data, and then initiate an async refresh. Callers must hold hdaps_mtx. +/** + * hdaps_set_ec_config - set accelerometer parameters. + * @ec_rate: embedded controller sampling rate + * @order: embedded controller running average filter order + * (Normally we have @ec_rate = sampling_rate * oversampling_ratio.) + * Returns zero on success and negative error code on failure. Can sleep. */ -static inline void __device_complete(void) +static int hdaps_set_ec_config(int ec_rate, int order) { - inb(0x161f); - inb(0x1604); - __device_refresh(); + struct thinkpad_ec_row args = { .mask = 0x000F, + .val = {0x10, (u8)ec_rate, (u8)(ec_rate>>8), order} }; + struct thinkpad_ec_row data = { .mask = 0x8000 }; + int ret = thinkpad_ec_read_row(&args, &data); + pr_debug("setting ec_rate=%d, filter_order=%d\n", ec_rate, order); + if (ret) + return ret; + if (data.val[0xF] == 0x03) { + pr_warn("config param out of range\n"); + return -EINVAL; + } + if (data.val[0xF] == 0x06) { + pr_warn("config change already pending\n"); + return -EBUSY; + } + if (data.val[0xF] != 0x00) { + pr_warn("config change error, ret=%d\n", + data.val[0xF]); + return -EIO; + } + return 0; } -/* - * hdaps_readb_one - reads a byte from a single I/O port, placing the value in - * the given pointer. Returns zero on success or a negative error on failure. - * Can sleep. +/** + * hdaps_get_ec_config - get accelerometer parameters. + * @ec_rate: embedded controller sampling rate + * @order: embedded controller running average filter order + * Returns zero on success and negative error code on failure. Can sleep. */ -static int hdaps_readb_one(unsigned int port, u8 *val) +static int hdaps_get_ec_config(int *ec_rate, int *order) { - int ret; - - mutex_lock(&hdaps_mtx); - - /* do a sync refresh -- we need to be sure that we read fresh data */ - ret = __device_refresh_sync(); + const struct thinkpad_ec_row args = + { .mask = 0x0003, .val = {0x17, 0x82} }; + struct thinkpad_ec_row data = { .mask = 0x801F }; + int ret = thinkpad_ec_read_row(&args, &data); if (ret) - goto out; - - *val = inb(port); - __device_complete(); - -out: - mutex_unlock(&hdaps_mtx); - return ret; + return ret; + if (data.val[0xF] != 0x00) + return -EIO; + if (!(data.val[0x1] & 0x01)) + return -ENXIO; /* accelerometer polling not enabled */ + if (data.val[0x1] & 0x02) + return -EBUSY; /* config change in progress, retry later */ + *ec_rate = data.val[0x2] | ((int)(data.val[0x3]) << 8); + *order = data.val[0x4]; + return 0; } -/* __hdaps_read_pair - internal lockless helper for hdaps_read_pair(). */ -static int __hdaps_read_pair(unsigned int port1, unsigned int port2, - int *x, int *y) +/** + * hdaps_get_ec_mode - get EC accelerometer mode + * Returns zero on success and negative error code on failure. Can sleep. 
+ */ +static int hdaps_get_ec_mode(u8 *mode) { - /* do a sync refresh -- we need to be sure that we read fresh data */ - if (__device_refresh_sync()) + const struct thinkpad_ec_row args = + { .mask = 0x0001, .val = {0x13} }; + struct thinkpad_ec_row data = { .mask = 0x8002 }; + int ret = thinkpad_ec_read_row(&args, &data); + if (ret) + return ret; + if (data.val[0xF] != 0x00) { + pr_warn("accelerometer not implemented (0x%02x)\n", + data.val[0xF]); return -EIO; - - *y = inw(port2); - *x = inw(port1); - km_activity = inb(HDAPS_PORT_KMACT); - __device_complete(); - - /* hdaps_invert is a bitvector to negate the axes */ - if (hdaps_invert & HDAPS_X_AXIS) - *x = -*x; - if (hdaps_invert & HDAPS_Y_AXIS) - *y = -*y; - + } + *mode = data.val[0x1]; return 0; } -/* - * hdaps_read_pair - reads the values from a pair of ports, placing the values - * in the given pointers. Returns zero on success. Can sleep. +/** + * hdaps_check_ec - checks something about the EC. + * Follows the clean-room spec for HDAPS; we don't know what it means. + * Returns zero on success and negative error code on failure. Can sleep. */ -static int hdaps_read_pair(unsigned int port1, unsigned int port2, - int *val1, int *val2) +static int hdaps_check_ec(void) { - int ret; - - mutex_lock(&hdaps_mtx); - ret = __hdaps_read_pair(port1, port2, val1, val2); - mutex_unlock(&hdaps_mtx); - - return ret; + const struct thinkpad_ec_row args = + { .mask = 0x0003, .val = {0x17, 0x81} }; + struct thinkpad_ec_row data = { .mask = 0x800E }; + int ret = thinkpad_ec_read_row(&args, &data); + if (ret) + return ret; + if (!((data.val[0x1] == 0x00 && data.val[0x2] == 0x60) || /* cleanroom spec */ + (data.val[0x1] == 0x01 && data.val[0x2] == 0x00)) || /* seen on T61 */ + data.val[0x3] != 0x00 || data.val[0xF] != 0x00) { + pr_warn("hdaps_check_ec: bad response (0x%x,0x%x,0x%x,0x%x)\n", + data.val[0x1], data.val[0x2], + data.val[0x3], data.val[0xF]); + return -EIO; + } + return 0; } -/* - * hdaps_device_init - initialize the accelerometer. Returns zero on success - * and negative error code on failure. Can sleep. +/** + * hdaps_device_init - initialize the accelerometer. + * + * Call several embedded controller functions to test and initialize the + * accelerometer. + * Returns zero on success and negative error code on failure. Can sleep. */ +#define FAILED_INIT(msg) pr_err("init failed at: %s\n", msg) static int hdaps_device_init(void) { - int total, ret = -ENXIO; + int ret; + u8 mode; - mutex_lock(&hdaps_mtx); + ret = thinkpad_ec_lock(); + if (ret) + return ret; - outb(0x13, 0x1610); - outb(0x01, 0x161f); - if (__wait_latch(0x161f, 0x00)) - goto out; + if (hdaps_get_ec_mode(&mode)) + { FAILED_INIT("hdaps_get_ec_mode failed"); goto bad; } - /* - * Most ThinkPads return 0x01. - * - * Others--namely the R50p, T41p, and T42p--return 0x03. These laptops - * have "inverted" axises. - * - * The 0x02 value occurs when the chip has been previously initialized. 
- */ - if (__check_latch(0x1611, 0x03) && - __check_latch(0x1611, 0x02) && - __check_latch(0x1611, 0x01)) - goto out; + pr_debug("initial mode latch is 0x%02x\n", mode); + if (mode == 0x00) + { FAILED_INIT("accelerometer not available"); goto bad; } - printk(KERN_DEBUG "hdaps: initial latch check good (0x%02x)\n", - __get_latch(0x1611)); + if (hdaps_check_ec()) + { FAILED_INIT("hdaps_check_ec failed"); goto bad; } - outb(0x17, 0x1610); - outb(0x81, 0x1611); - outb(0x01, 0x161f); - if (__wait_latch(0x161f, 0x00)) - goto out; - if (__wait_latch(0x1611, 0x00)) - goto out; - if (__wait_latch(0x1612, 0x60)) - goto out; - if (__wait_latch(0x1613, 0x00)) - goto out; - outb(0x14, 0x1610); - outb(0x01, 0x1611); - outb(0x01, 0x161f); - if (__wait_latch(0x161f, 0x00)) - goto out; - outb(0x10, 0x1610); - outb(0xc8, 0x1611); - outb(0x00, 0x1612); - outb(0x02, 0x1613); - outb(0x01, 0x161f); - if (__wait_latch(0x161f, 0x00)) - goto out; - if (__device_refresh_sync()) - goto out; - if (__wait_latch(0x1611, 0x00)) - goto out; + if (hdaps_set_power(1)) + { FAILED_INIT("hdaps_set_power failed"); goto bad; } - /* we have done our dance, now let's wait for the applause */ - for (total = INIT_TIMEOUT_MSECS; total > 0; total -= INIT_WAIT_MSECS) { - int x, y; - - /* a read of the device helps push it into action */ - __hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y); - if (!__wait_latch(0x1611, 0x02)) { - ret = 0; - break; - } + if (hdaps_set_ec_config(sampling_rate*oversampling_ratio, + running_avg_filter_order)) + { FAILED_INIT("hdaps_set_ec_config failed"); goto bad; } - msleep(INIT_WAIT_MSECS); - } + thinkpad_ec_invalidate(); + udelay(200); -out: - mutex_unlock(&hdaps_mtx); + /* Just prefetch instead of reading, to avoid ~1sec delay on load */ + ret = thinkpad_ec_prefetch_row(&ec_accel_args); + if (ret) + { FAILED_INIT("initial prefetch failed"); goto bad; } + goto good; +bad: + thinkpad_ec_invalidate(); + ret = -ENXIO; +good: + stale_readout = 1; + thinkpad_ec_unlock(); return ret; } +/** + * hdaps_device_shutdown - power off the accelerometer + * Returns nonzero on failure. Can sleep. + */ +static int hdaps_device_shutdown(void) +{ + int ret; + ret = hdaps_set_power(0); + if (ret) { + pr_warn("cannot power off\n"); + return ret; + } + ret = hdaps_set_ec_config(0, 1); + if (ret) + pr_warn("cannot stop EC sampling\n"); + return ret; +} /* Device model stuff */ @@ -306,13 +424,29 @@ } #ifdef CONFIG_PM_SLEEP +static int hdaps_suspend(struct device *dev) +{ + /* Don't do hdaps polls until resume re-initializes the sensor. */ + del_timer_sync(&hdaps_timer); + hdaps_device_shutdown(); /* ignore errors, effect is negligible */ + return 0; +} + static int hdaps_resume(struct device *dev) { - return hdaps_device_init(); + int ret = hdaps_device_init(); + if (ret) + return ret; + + mutex_lock(&hdaps_users_mtx); + if (hdaps_users) + mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate); + mutex_unlock(&hdaps_users_mtx); + return 0; } #endif -static SIMPLE_DEV_PM_OPS(hdaps_pm, NULL, hdaps_resume); +static SIMPLE_DEV_PM_OPS(hdaps_pm, hdaps_suspend, hdaps_resume); static struct platform_driver hdaps_driver = { .probe = hdaps_probe, @@ -322,30 +456,47 @@ }, }; -/* - * hdaps_calibrate - Set our "resting" values. Callers must hold hdaps_mtx. +/** + * hdaps_calibrate - set our "resting" values. + * Does its own locking. 
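
The configuration applied by hdaps_device_init() above ties the three tunables together: the EC is asked to sample at sampling_rate * oversampling_ratio with the given running-average filter order, while the input timer fires every HZ/sampling_rate jiffies. A small sketch with the driver defaults (50 Hz, ratio 5, order 2) and an assumed HZ of 250:

/* Illustrative only: the default polling arithmetic used by this driver. */
#include <stdio.h>

int main(void)
{
        int hz = 250;                    /* assumed CONFIG_HZ, for illustration */
        int sampling_rate = 50;          /* driver defaults */
        int oversampling_ratio = 5;
        int running_avg_filter_order = 2;

        printf("EC rate: %d Hz, filter order %d, poll every %d jiffies\n",
               sampling_rate * oversampling_ratio,
               running_avg_filter_order,
               hz / sampling_rate);      /* 250 Hz, order 2, 5 jiffies */
        return 0;
}
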
*/ static void hdaps_calibrate(void) { - __hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &rest_x, &rest_y); + needs_calibration = 1; + hdaps_update(); + /* If that fails, the mousedev poll will take care of things later. */ } -static void hdaps_mousedev_poll(struct input_polled_dev *dev) +/* Timer handler for updating the input device. Runs in softirq context, + * so avoid lenghty or blocking operations. + */ +static void hdaps_mousedev_poll(unsigned long unused) { - struct input_dev *input_dev = dev->input; - int x, y; - - mutex_lock(&hdaps_mtx); + int ret; - if (__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y)) - goto out; + stale_readout = 1; - input_report_abs(input_dev, ABS_X, x - rest_x); - input_report_abs(input_dev, ABS_Y, y - rest_y); - input_sync(input_dev); + /* Cannot sleep. Try nonblockingly. If we fail, try again later. */ + if (thinkpad_ec_try_lock()) + goto keep_active; + + ret = __hdaps_update(1); /* fast update, we're in softirq context */ + thinkpad_ec_unlock(); + /* Any of "successful", "not yet ready" and "not prefetched"? */ + if (ret != 0 && ret != -EBUSY && ret != -ENODATA) { + pr_err("poll failed, disabling updates\n"); + return; + } -out: - mutex_unlock(&hdaps_mtx); +keep_active: + /* Even if we failed now, pos_x,y may have been updated earlier: */ + input_report_abs(hdaps_idev, ABS_X, pos_x - rest_x); + input_report_abs(hdaps_idev, ABS_Y, pos_y - rest_y); + input_sync(hdaps_idev); + input_report_abs(hdaps_idev_raw, ABS_X, pos_x); + input_report_abs(hdaps_idev_raw, ABS_Y, pos_y); + input_sync(hdaps_idev_raw); + mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate); } @@ -354,65 +505,41 @@ static ssize_t hdaps_position_show(struct device *dev, struct device_attribute *attr, char *buf) { - int ret, x, y; - - ret = hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y); - if (ret) - return ret; - - return sprintf(buf, "(%d,%d)\n", x, y); -} - -static ssize_t hdaps_variance_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - int ret, x, y; - - ret = hdaps_read_pair(HDAPS_PORT_XVAR, HDAPS_PORT_YVAR, &x, &y); + int ret = hdaps_update(); if (ret) return ret; - - return sprintf(buf, "(%d,%d)\n", x, y); + return sprintf(buf, "(%d,%d)\n", pos_x, pos_y); } static ssize_t hdaps_temp1_show(struct device *dev, struct device_attribute *attr, char *buf) { - u8 uninitialized_var(temp); - int ret; - - ret = hdaps_readb_one(HDAPS_PORT_TEMP1, &temp); + int ret = hdaps_update(); if (ret) return ret; - - return sprintf(buf, "%u\n", temp); -} - -static ssize_t hdaps_temp2_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u8 uninitialized_var(temp); - int ret; - - ret = hdaps_readb_one(HDAPS_PORT_TEMP2, &temp); - if (ret) - return ret; - - return sprintf(buf, "%u\n", temp); + return sprintf(buf, "%d\n", temperature); } static ssize_t hdaps_keyboard_activity_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", KEYBD_ISSET(km_activity)); + int ret = hdaps_update(); + if (ret) + return ret; + return sprintf(buf, "%u\n", + get_jiffies_64() < last_keyboard_jiffies + KMACT_REMEMBER_PERIOD); } static ssize_t hdaps_mouse_activity_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", MOUSE_ISSET(km_activity)); + int ret = hdaps_update(); + if (ret) + return ret; + return sprintf(buf, "%u\n", + get_jiffies_64() < last_mouse_jiffies + KMACT_REMEMBER_PERIOD); } static ssize_t hdaps_calibrate_show(struct device *dev, @@ -425,10 +552,7 @@ struct 
device_attribute *attr, const char *buf, size_t count) { - mutex_lock(&hdaps_mtx); hdaps_calibrate(); - mutex_unlock(&hdaps_mtx); - return count; } @@ -445,7 +569,7 @@ int invert; if (sscanf(buf, "%d", &invert) != 1 || - invert < 0 || invert > HDAPS_BOTH_AXES) + invert < 0 || invert > HDAPS_ORIENT_MAX) return -EINVAL; hdaps_invert = invert; @@ -454,24 +578,128 @@ return count; } +static ssize_t hdaps_sampling_rate_show( + struct device *dev, struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", sampling_rate); +} + +static ssize_t hdaps_sampling_rate_store( + struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int rate, ret; + if (sscanf(buf, "%d", &rate) != 1 || rate > HZ || rate <= 0) { + pr_warn("must have 0ident); - return 1; -} - /* hdaps_dmi_match_invert - found an inverted match. */ static int __init hdaps_dmi_match_invert(const struct dmi_system_id *id) { - hdaps_invert = (unsigned long)id->driver_data; - pr_info("inverting axis (%u) readings\n", hdaps_invert); - return hdaps_dmi_match(id); + unsigned int orient = (kernel_ulong_t) id->driver_data; + hdaps_invert = orient; + pr_info("%s detected, setting orientation %u\n", id->ident, orient); + return 1; /* stop enumeration */ } -#define HDAPS_DMI_MATCH_INVERT(vendor, model, axes) { \ +#define HDAPS_DMI_MATCH_INVERT(vendor, model, orient) { \ .ident = vendor " " model, \ .callback = hdaps_dmi_match_invert, \ - .driver_data = (void *)axes, \ + .driver_data = (void *)(orient), \ .matches = { \ DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ DMI_MATCH(DMI_PRODUCT_VERSION, model) \ } \ } -#define HDAPS_DMI_MATCH_NORMAL(vendor, model) \ - HDAPS_DMI_MATCH_INVERT(vendor, model, 0) - -/* Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match - "ThinkPad T42p", so the order of the entries matters. - If your ThinkPad is not recognized, please update to latest - BIOS. This is especially the case for some R52 ThinkPads. */ -static struct dmi_system_id __initdata hdaps_whitelist[] = { - HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R50"), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R51"), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R52"), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61i", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T41"), - HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T42"), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T43"), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61p", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X40"), - HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_Y_AXIS), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61s", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad Z60m"), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61m", HDAPS_BOTH_AXES), - HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61p", HDAPS_BOTH_AXES), +/* List of models with abnormal axis configuration. 
+ Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match + "ThinkPad T42p", and enumeration stops after first match, + so the order of the entries matters. */ +struct dmi_system_id __initdata hdaps_whitelist[] = { + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X40", HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R400", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R500", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60 Tablet", HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60s", HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400s", HDAPS_ORIENT_INVERT_X), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410s", HDAPS_ORIENT_SWAP), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T500", HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T510", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W51O", HDAPS_ORIENT_MAX), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201 Tablet", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), + HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X220", HDAPS_ORIENT_SWAP), { .ident = NULL } }; static int __init hdaps_init(void) { - struct input_dev *idev; int ret; - if (!dmi_check_system(hdaps_whitelist)) { - pr_warn("supported laptop not found!\n"); - ret = -ENODEV; - goto out; - } - - if (!request_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS, "hdaps")) { - ret = -ENXIO; - goto out; - } - + /* Determine axis orientation orientation */ + if (hdaps_invert == HDAPS_ORIENT_UNDEFINED) /* set by module param? */ + if (dmi_check_system(hdaps_whitelist) < 1) /* in whitelist? 
*/ + hdaps_invert = 0; /* default */ + + /* Init timer before platform_driver_register, in case of suspend */ + init_timer(&hdaps_timer); + hdaps_timer.function = hdaps_mousedev_poll; ret = platform_driver_register(&hdaps_driver); if (ret) - goto out_region; + goto out; pdev = platform_device_register_simple("hdaps", -1, NULL, 0); if (IS_ERR(pdev)) { @@ -571,47 +792,79 @@ if (ret) goto out_device; - hdaps_idev = input_allocate_polled_device(); + hdaps_idev = input_allocate_device(); if (!hdaps_idev) { ret = -ENOMEM; goto out_group; } - hdaps_idev->poll = hdaps_mousedev_poll; - hdaps_idev->poll_interval = HDAPS_POLL_INTERVAL; + hdaps_idev_raw = input_allocate_device(); + if (!hdaps_idev_raw) { + ret = -ENOMEM; + goto out_idev_first; + } - /* initial calibrate for the input device */ - hdaps_calibrate(); + /* calibration for the input device (deferred to avoid delay) */ + needs_calibration = 1; - /* initialize the input class */ - idev = hdaps_idev->input; - idev->name = "hdaps"; - idev->phys = "isa1600/input0"; - idev->id.bustype = BUS_ISA; - idev->dev.parent = &pdev->dev; - idev->evbit[0] = BIT_MASK(EV_ABS); - input_set_abs_params(idev, ABS_X, + /* initialize the joystick-like fuzzed input device */ + hdaps_idev->name = "ThinkPad HDAPS joystick emulation"; + hdaps_idev->phys = "hdaps/input0"; + hdaps_idev->id.bustype = BUS_HOST; + hdaps_idev->id.vendor = HDAPS_INPUT_VENDOR; + hdaps_idev->id.product = HDAPS_INPUT_PRODUCT; + hdaps_idev->id.version = HDAPS_INPUT_JS_VERSION; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + hdaps_idev->cdev.dev = &pdev->dev; +#endif + hdaps_idev->evbit[0] = BIT(EV_ABS); + hdaps_idev->open = hdaps_mousedev_open; + hdaps_idev->close = hdaps_mousedev_close; + input_set_abs_params(hdaps_idev, ABS_X, -256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); - input_set_abs_params(idev, ABS_Y, + input_set_abs_params(hdaps_idev, ABS_Y, -256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); - ret = input_register_polled_device(hdaps_idev); + ret = input_register_device(hdaps_idev); if (ret) goto out_idev; - pr_info("driver successfully loaded\n"); + /* initialize the raw data input device */ + hdaps_idev_raw->name = "ThinkPad HDAPS accelerometer data"; + hdaps_idev_raw->phys = "hdaps/input1"; + hdaps_idev_raw->id.bustype = BUS_HOST; + hdaps_idev_raw->id.vendor = HDAPS_INPUT_VENDOR; + hdaps_idev_raw->id.product = HDAPS_INPUT_PRODUCT; + hdaps_idev_raw->id.version = HDAPS_INPUT_RAW_VERSION; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + hdaps_idev_raw->cdev.dev = &pdev->dev; +#endif + hdaps_idev_raw->evbit[0] = BIT(EV_ABS); + hdaps_idev_raw->open = hdaps_mousedev_open; + hdaps_idev_raw->close = hdaps_mousedev_close; + input_set_abs_params(hdaps_idev_raw, ABS_X, -32768, 32767, 0, 0); + input_set_abs_params(hdaps_idev_raw, ABS_Y, -32768, 32767, 0, 0); + + ret = input_register_device(hdaps_idev_raw); + if (ret) + goto out_idev_reg_first; + + pr_info("driver successfully loaded.\n"); return 0; +out_idev_reg_first: + input_unregister_device(hdaps_idev); out_idev: - input_free_polled_device(hdaps_idev); + input_free_device(hdaps_idev_raw); +out_idev_first: + input_free_device(hdaps_idev); out_group: sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group); out_device: platform_device_unregister(pdev); out_driver: platform_driver_unregister(&hdaps_driver); -out_region: - release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS); + hdaps_device_shutdown(); out: pr_warn("driver init failed (ret=%d)!\n", ret); return ret; @@ -619,12 +872,12 @@ static void __exit hdaps_exit(void) { - 
input_unregister_polled_device(hdaps_idev); - input_free_polled_device(hdaps_idev); + input_unregister_device(hdaps_idev_raw); + input_unregister_device(hdaps_idev); + hdaps_device_shutdown(); /* ignore errors, effect is negligible */ sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group); platform_device_unregister(pdev); platform_driver_unregister(&hdaps_driver); - release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS); pr_info("driver unloaded\n"); } @@ -632,9 +885,8 @@ module_init(hdaps_init); module_exit(hdaps_exit); -module_param_named(invert, hdaps_invert, int, 0); -MODULE_PARM_DESC(invert, "invert data along each axis. 1 invert x-axis, " - "2 invert y-axis, 3 invert both axes."); +module_param_named(invert, hdaps_invert, uint, 0); +MODULE_PARM_DESC(invert, "axis orientation code"); MODULE_AUTHOR("Robert Love"); MODULE_DESCRIPTION("IBM Hard Drive Active Protection System (HDAPS) driver"); diff -Nur linux-4.2/drivers/platform/x86/thinkpad_ec.c linux-4.2-pck/drivers/platform/x86/thinkpad_ec.c --- linux-4.2/drivers/platform/x86/thinkpad_ec.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/drivers/platform/x86/thinkpad_ec.c 2015-09-08 00:48:22.225030705 -0300 @@ -0,0 +1,513 @@ +/* + * thinkpad_ec.c - ThinkPad embedded controller LPC3 functions + * + * The embedded controller on ThinkPad laptops has a non-standard interface, + * where LPC channel 3 of the H8S EC chip is hooked up to IO ports + * 0x1600-0x161F and implements (a special case of) the H8S LPC protocol. + * The EC LPC interface provides various system management services (currently + * known: battery information and accelerometer readouts). This driver + * provides access and mutual exclusion for the EC interface. +* + * The LPC protocol and terminology are documented here: + * "H8S/2104B Group Hardware Manual", + * http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf + * + * Copyright (C) 2006-2007 Shem Multinymous + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) + #include +#else + #include +#endif + +#define TP_VERSION "0.41" + +MODULE_AUTHOR("Shem Multinymous"); +MODULE_DESCRIPTION("ThinkPad embedded controller hardware access"); +MODULE_VERSION(TP_VERSION); +MODULE_LICENSE("GPL"); + +/* IO ports used by embedded controller LPC channel 3: */ +#define TPC_BASE_PORT 0x1600 +#define TPC_NUM_PORTS 0x20 +#define TPC_STR3_PORT 0x1604 /* Reads H8S EC register STR3 */ +#define TPC_TWR0_PORT 0x1610 /* Mapped to H8S EC register TWR0MW/SW */ +#define TPC_TWR15_PORT 0x161F /* Mapped to H8S EC register TWR15. 
*/ + /* (and port TPC_TWR0_PORT+i is mapped to H8S reg TWRi for 00x%02x", \ + msg, args->val[0x0], args->val[0xF], code) + +/* State of request prefetching: */ +static u8 prefetch_arg0, prefetch_argF; /* Args of last prefetch */ +static u64 prefetch_jiffies; /* time of prefetch, or: */ +#define TPC_PREFETCH_NONE INITIAL_JIFFIES /* No prefetch */ +#define TPC_PREFETCH_JUNK (INITIAL_JIFFIES+1) /* Ignore prefetch */ + +/* Locking: */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) +static DECLARE_MUTEX(thinkpad_ec_mutex); +#else +static DEFINE_SEMAPHORE(thinkpad_ec_mutex); +#endif + +/* Kludge in case the ACPI DSDT reserves the ports we need. */ +static bool force_io; /* Willing to do IO to ports we couldn't reserve? */ +static int reserved_io; /* Successfully reserved the ports? */ +module_param_named(force_io, force_io, bool, 0600); +MODULE_PARM_DESC(force_io, "Force IO even if region already reserved (0=off, 1=on)"); + +/** + * thinkpad_ec_lock - get lock on the ThinkPad EC + * + * Get exclusive lock for accesing the ThinkPad embedded controller LPC3 + * interface. Returns 0 iff lock acquired. + */ +int thinkpad_ec_lock(void) +{ + int ret; + ret = down_interruptible(&thinkpad_ec_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(thinkpad_ec_lock); + +/** + * thinkpad_ec_try_lock - try getting lock on the ThinkPad EC + * + * Try getting an exclusive lock for accesing the ThinkPad embedded + * controller LPC3. Returns immediately if lock is not available; neither + * blocks nor sleeps. Returns 0 iff lock acquired . + */ +int thinkpad_ec_try_lock(void) +{ + return down_trylock(&thinkpad_ec_mutex); +} +EXPORT_SYMBOL_GPL(thinkpad_ec_try_lock); + +/** + * thinkpad_ec_unlock - release lock on ThinkPad EC + * + * Release a previously acquired exclusive lock on the ThinkPad ebmedded + * controller LPC3 interface. + */ +void thinkpad_ec_unlock(void) +{ + up(&thinkpad_ec_mutex); +} +EXPORT_SYMBOL_GPL(thinkpad_ec_unlock); + +/** + * thinkpad_ec_request_row - tell embedded controller to prepare a row + * @args Input register arguments + * + * Requests a data row by writing to H8S LPC registers TRW0 through TWR15 (or + * a subset thereof) following the protocol prescribed by the "H8S/2104B Group + * Hardware Manual". Does sanity checks via status register STR3. + */ +static int thinkpad_ec_request_row(const struct thinkpad_ec_row *args) +{ + u8 str3; + int i; + + /* EC protocol requires write to TWR0 (function code): */ + if (!(args->mask & 0x0001)) { + printk(KERN_ERR MSG_FMT("bad args->mask=0x%02x", args->mask)); + return -EINVAL; + } + + /* Check initial STR3 status: */ + str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; + if (str3 & H8S_STR3_OBF3B) { /* data already pending */ + inb(TPC_TWR15_PORT); /* marks end of previous transaction */ + if (prefetch_jiffies == TPC_PREFETCH_NONE) + printk(KERN_WARNING REQ_FMT( + "EC has result from unrequested transaction", + str3)); + return -EBUSY; /* EC will be ready in a few usecs */ + } else if (str3 == H8S_STR3_SWMF) { /* busy with previous request */ + if (prefetch_jiffies == TPC_PREFETCH_NONE) + printk(KERN_WARNING REQ_FMT( + "EC is busy with unrequested transaction", + str3)); + return -EBUSY; /* data will be pending in a few usecs */ + } else if (str3 != 0x00) { /* unexpected status? */ + printk(KERN_WARNING REQ_FMT("unexpected initial STR3", str3)); + return -EIO; + } + + /* Send TWR0MW: */ + outb(args->val[0], TPC_TWR0_PORT); + str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; + if (str3 != H8S_STR3_MWMF) { /* not accepted? 
*/ + printk(KERN_WARNING REQ_FMT("arg0 rejected", str3)); + return -EIO; + } + + /* Send TWR1 through TWR14: */ + for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) + if ((args->mask>>i)&1) + outb(args->val[i], TPC_TWR0_PORT+i); + + /* Send TWR15 (default to 0x01). This marks end of command. */ + outb((args->mask & 0x8000) ? args->val[0xF] : 0x01, TPC_TWR15_PORT); + + /* Wait until EC starts writing its reply (~60ns on average). + * Releasing locks before this happens may cause an EC hang + * due to firmware bug! + */ + for (i = 0; i < TPC_REQUEST_RETRIES; i++) { + str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; + if (str3 & H8S_STR3_SWMF) /* EC started replying */ + return 0; + else if (!(str3 & ~(H8S_STR3_IBF3B|H8S_STR3_MWMF))) + /* Normal progress (the EC hasn't seen the request + * yet, or is processing it). Wait it out. */ + ndelay(TPC_REQUEST_NDELAY); + else { /* weird EC status */ + printk(KERN_WARNING + REQ_FMT("bad end STR3", str3)); + return -EIO; + } + } + printk(KERN_WARNING REQ_FMT("EC is mysteriously silent", str3)); + return -EIO; +} + +/** + * thinkpad_ec_read_data - read pre-requested row-data from EC + * @args Input register arguments of pre-requested rows + * @data Output register values + * + * Reads current row data from the controller, assuming it's already + * requested. Follows the H8S spec for register access and status checks. + */ +static int thinkpad_ec_read_data(const struct thinkpad_ec_row *args, + struct thinkpad_ec_row *data) +{ + int i; + u8 str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; + /* Once we make a request, STR3 assumes the sequence of values listed + * in the following 'if' as it reads the request and writes its data. + * It takes about a few dozen nanosecs total, with very high variance. + */ + if (str3 == (H8S_STR3_IBF3B|H8S_STR3_MWMF) || + str3 == 0x00 || /* the 0x00 is indistinguishable from idle EC! */ + str3 == H8S_STR3_SWMF) + return -EBUSY; /* not ready yet */ + /* Finally, the EC signals output buffer full: */ + if (str3 != (H8S_STR3_OBF3B|H8S_STR3_SWMF)) { + printk(KERN_WARNING + REQ_FMT("bad initial STR3", str3)); + return -EIO; + } + + /* Read first byte (signals start of read transactions): */ + data->val[0] = inb(TPC_TWR0_PORT); + /* Optionally read 14 more bytes: */ + for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) + if ((data->mask >> i)&1) + data->val[i] = inb(TPC_TWR0_PORT+i); + /* Read last byte from 0x161F (signals end of read transaction): */ + data->val[0xF] = inb(TPC_TWR15_PORT); + + /* Readout still pending? */ + str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; + if (str3 & H8S_STR3_OBF3B) + printk(KERN_WARNING + REQ_FMT("OBF3B=1 after read", str3)); + /* If port 0x161F returns 0x80 too often, the EC may lock up. Warn: */ + if (data->val[0xF] == 0x80) + printk(KERN_WARNING + REQ_FMT("0x161F reports error", data->val[0xF])); + return 0; +} + +/** + * thinkpad_ec_is_row_fetched - is the given row currently prefetched? + * + * To keep things simple we compare only the first and last args; + * this suffices for all known cases. 
+ */
+static int thinkpad_ec_is_row_fetched(const struct thinkpad_ec_row *args)
+{
+ return (prefetch_jiffies != TPC_PREFETCH_NONE) &&
+ (prefetch_jiffies != TPC_PREFETCH_JUNK) &&
+ (prefetch_arg0 == args->val[0]) &&
+ (prefetch_argF == args->val[0xF]) &&
+ (get_jiffies_64() < prefetch_jiffies + TPC_PREFETCH_TIMEOUT);
+}
+
+/**
+ * thinkpad_ec_read_row - request and read data from ThinkPad EC
+ * @args Input register arguments
+ * @data Output register values
+ *
+ * Read a data row from the ThinkPad embedded controller LPC3 interface.
+ * Does fetching and retrying if needed. The row is specified by an
+ * array of 16 bytes, some of which may be undefined (but the first is
+ * mandatory). These bytes are given in @args->val[], where @args->val[i] is
+ * used iff ((@args->mask>>i)&1). The resulting row data is stored in
+ * @data->val[], but is only guaranteed to be valid for indices corresponding
+ * to set bits in @data->mask. That is, if @data->mask&(1<<i)==0 then
+ * @data->val[i] is undefined.
+ *
+ * Returns -EBUSY on transient error and -EIO on abnormal condition.
+ * Caller must hold controller lock.
+ */
+int thinkpad_ec_read_row(const struct thinkpad_ec_row *args,
+ struct thinkpad_ec_row *data)
+{
+ int retries, ret;
+
+ if (thinkpad_ec_is_row_fetched(args))
+ goto read_row; /* already requested */
+
+ /* Request the row */
+ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) {
+ ret = thinkpad_ec_request_row(args);
+ if (!ret)
+ goto read_row;
+ if (ret != -EBUSY)
+ break;
+ ndelay(TPC_READ_NDELAY);
+ }
+ printk(KERN_ERR REQ_FMT("failed requesting row", ret));
+ goto out;
+
+read_row:
+ /* Read the row's data */
+ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) {
+ ret = thinkpad_ec_read_data(args, data);
+ if (!ret)
+ goto out;
+ if (ret != -EBUSY)
+ break;
+ ndelay(TPC_READ_NDELAY);
+ }
+
+ printk(KERN_ERR REQ_FMT("failed waiting for data", ret));
+
+out:
+ prefetch_jiffies = TPC_PREFETCH_JUNK;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_read_row);
+
+/**
+ * thinkpad_ec_try_read_row - try reading prefetched data from ThinkPad EC
+ * @args Input register arguments
+ * @data Output register values
+ *
+ * Try reading a data row from the ThinkPad embedded controller LPC3
+ * interface, if this row was recently prefetched using
+ * thinkpad_ec_prefetch_row(). Does not fetch, retry or block.
+ * The parameters have the same meaning as in thinkpad_ec_read_row().
+ *
+ * Returns -EBUSY if data not ready and -ENODATA if row not prefetched.
+ * Caller must hold controller lock.
+ */
+int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args,
+ struct thinkpad_ec_row *data)
+{
+ int ret;
+ if (!thinkpad_ec_is_row_fetched(args)) {
+ ret = -ENODATA;
+ } else {
+ ret = thinkpad_ec_read_data(args, data);
+ if (!ret)
+ prefetch_jiffies = TPC_PREFETCH_NONE; /* eaten up */
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_try_read_row);
+
+/**
+ * thinkpad_ec_prefetch_row - prefetch data from ThinkPad EC
+ * @args Input register arguments
+ *
+ * Prefetch a data row from the ThinkPad embedded controller LPC3
+ * interface. A subsequent call to thinkpad_ec_read_row() with the
+ * same arguments will be faster, and a subsequent call to
+ * thinkpad_ec_try_read_row() stands a good chance of succeeding if
+ * done neither too soon nor too late. See
+ * thinkpad_ec_read_row() for the meaning of @args.
+ *
+ * Returns -EBUSY on transient error and -EIO on abnormal condition.
+ * Caller must hold controller lock.
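
The exported row API above is the pattern both hdaps.c and tp_smapi.c follow: take the controller lock, fill a thinkpad_ec_row with the EC function code in val[0] and a mask of the bytes of interest, read the reply, and check val[0xF] for the EC status. A minimal sketch of such a caller, modeled on thinkpad_ec_test() and read_tp_ec_row() later in this patch and assuming the header the patch adds is available as linux/thinkpad_ec.h:

#include <linux/types.h>
#include <linux/string.h>
#include <linux/thinkpad_ec.h>  /* header added by this patch (assumed path) */

/* Hypothetical helper: fetch the basic status row for battery 0. */
static int example_read_battery0_status(u8 *out, int len)
{
        /* Command 0x01 in val[0], battery selector in val[0xF] (mask 0x8001),
         * as in thinkpad_ec_test(); request all 16 reply bytes. */
        const struct thinkpad_ec_row args =
                { .mask = 0x8001, .val = {0x01, [0xF] = 0x00} };
        struct thinkpad_ec_row data = { .mask = 0xFFFF };
        int ret;

        ret = thinkpad_ec_lock();       /* may sleep; 0 means lock held */
        if (ret)
                return ret;
        ret = thinkpad_ec_read_row(&args, &data);
        thinkpad_ec_unlock();
        if (ret)
                return ret;

        if (len > TP_CONTROLLER_ROW_LEN)
                len = TP_CONTROLLER_ROW_LEN;
        memcpy(out, data.val, len);
        return 0;
}

thinkpad_ec_prefetch_row() plus thinkpad_ec_try_read_row() form the non-blocking variant of the same sequence, which is what the hdaps timer callback relies on.
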
+ */ +int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args) +{ + int ret; + ret = thinkpad_ec_request_row(args); + if (ret) { + prefetch_jiffies = TPC_PREFETCH_JUNK; + } else { + prefetch_jiffies = get_jiffies_64(); + prefetch_arg0 = args->val[0x0]; + prefetch_argF = args->val[0xF]; + } + return ret; +} +EXPORT_SYMBOL_GPL(thinkpad_ec_prefetch_row); + +/** + * thinkpad_ec_invalidate - invalidate prefetched ThinkPad EC data + * + * Invalidate the data prefetched via thinkpad_ec_prefetch_row() from the + * ThinkPad embedded controller LPC3 interface. + * Must be called before unlocking by any code that accesses the controller + * ports directly. + */ +void thinkpad_ec_invalidate(void) +{ + prefetch_jiffies = TPC_PREFETCH_JUNK; +} +EXPORT_SYMBOL_GPL(thinkpad_ec_invalidate); + + +/*** Checking for EC hardware ***/ + +/** + * thinkpad_ec_test - verify the EC is present and follows protocol + * + * Ensure the EC LPC3 channel really works on this machine by making + * an EC request and seeing if the EC follows the documented H8S protocol. + * The requested row just reads battery status, so it should be harmless to + * access it (on a correct EC). + * This test writes to IO ports, so execute only after checking DMI. + */ +static int __init thinkpad_ec_test(void) +{ + int ret; + const struct thinkpad_ec_row args = /* battery 0 basic status */ + { .mask = 0x8001, .val = {0x01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x00} }; + struct thinkpad_ec_row data = { .mask = 0x0000 }; + ret = thinkpad_ec_lock(); + if (ret) + return ret; + ret = thinkpad_ec_read_row(&args, &data); + thinkpad_ec_unlock(); + return ret; +} + +/* Search all DMI device names of a given type for a substring */ +static int __init dmi_find_substring(int type, const char *substr) +{ + const struct dmi_device *dev = NULL; + while ((dev = dmi_find_device(type, NULL, dev))) { + if (strstr(dev->name, substr)) + return 1; + } + return 0; +} + +#define TP_DMI_MATCH(vendor,model) { \ + .ident = vendor " " model, \ + .matches = { \ + DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ + DMI_MATCH(DMI_PRODUCT_VERSION, model) \ + } \ +} + +/* Check DMI for existence of ThinkPad embedded controller */ +static int __init check_dmi_for_ec(void) +{ + /* A few old models that have a good EC but don't report it in DMI */ + struct dmi_system_id tp_whitelist[] = { + TP_DMI_MATCH("IBM", "ThinkPad A30"), + TP_DMI_MATCH("IBM", "ThinkPad T23"), + TP_DMI_MATCH("IBM", "ThinkPad X24"), + TP_DMI_MATCH("LENOVO", "ThinkPad"), + { .ident = NULL } + }; + return dmi_find_substring(DMI_DEV_TYPE_OEM_STRING, + "IBM ThinkPad Embedded Controller") || + dmi_check_system(tp_whitelist); +} + +/*** Init and cleanup ***/ + +static int __init thinkpad_ec_init(void) +{ + if (!check_dmi_for_ec()) { + printk(KERN_WARNING + "thinkpad_ec: no ThinkPad embedded controller!\n"); + return -ENODEV; + } + + if (request_region(TPC_BASE_PORT, TPC_NUM_PORTS, "thinkpad_ec")) { + reserved_io = 1; + } else { + printk(KERN_ERR "thinkpad_ec: cannot claim IO ports %#x-%#x... 
", + TPC_BASE_PORT, + TPC_BASE_PORT + TPC_NUM_PORTS - 1); + if (force_io) { + printk("forcing use of unreserved IO ports.\n"); + } else { + printk("consider using force_io=1.\n"); + return -ENXIO; + } + } + prefetch_jiffies = TPC_PREFETCH_JUNK; + if (thinkpad_ec_test()) { + printk(KERN_ERR "thinkpad_ec: initial ec test failed\n"); + if (reserved_io) + release_region(TPC_BASE_PORT, TPC_NUM_PORTS); + return -ENXIO; + } + printk(KERN_INFO "thinkpad_ec: thinkpad_ec " TP_VERSION " loaded.\n"); + return 0; +} + +static void __exit thinkpad_ec_exit(void) +{ + if (reserved_io) + release_region(TPC_BASE_PORT, TPC_NUM_PORTS); + printk(KERN_INFO "thinkpad_ec: unloaded.\n"); +} + +module_init(thinkpad_ec_init); +module_exit(thinkpad_ec_exit); diff -Nur linux-4.2/drivers/platform/x86/tp_smapi.c linux-4.2-pck/drivers/platform/x86/tp_smapi.c --- linux-4.2/drivers/platform/x86/tp_smapi.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/drivers/platform/x86/tp_smapi.c 2015-09-08 00:48:22.225030705 -0300 @@ -0,0 +1,1493 @@ +/* + * tp_smapi.c - ThinkPad SMAPI support + * + * This driver exposes some features of the System Management Application + * Program Interface (SMAPI) BIOS found on ThinkPad laptops. It works on + * models in which the SMAPI BIOS runs in SMM and is invoked by writing + * to the APM control port 0xB2. + * It also exposes battery status information, obtained from the ThinkPad + * embedded controller (via the thinkpad_ec module). + * Ancient ThinkPad models use a different interface, supported by the + * "thinkpad" module from "tpctl". + * + * Many of the battery status values obtained from the EC simply mirror + * values provided by the battery's Smart Battery System (SBS) interface, so + * their meaning is defined by the Smart Battery Data Specification (see + * http://sbs-forum.org/specs/sbdat110.pdf). References to this SBS spec + * are given in the code where relevant. + * + * Copyright (C) 2006 Shem Multinymous . + * SMAPI access code based on the mwave driver by Mike Sullivan. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include /* CMOS defines */ +#include +#include +#include +#include +#include +#include + +#define TP_VERSION "0.41" +#define TP_DESC "ThinkPad SMAPI Support" +#define TP_DIR "smapi" + +MODULE_AUTHOR("Shem Multinymous"); +MODULE_DESCRIPTION(TP_DESC); +MODULE_VERSION(TP_VERSION); +MODULE_LICENSE("GPL"); + +static struct platform_device *pdev; + +static int tp_debug; +module_param_named(debug, tp_debug, int, 0600); +MODULE_PARM_DESC(debug, "Debug level (0=off, 1=on)"); + +/* A few macros for printk()ing: */ +#define TPRINTK(level, fmt, args...) \ + dev_printk(level, &(pdev->dev), "%s: " fmt "\n", __func__, ## args) +#define DPRINTK(fmt, args...) 
\
+ do { if (tp_debug) TPRINTK(KERN_DEBUG, fmt, ## args); } while (0)
+
+/*********************************************************************
+ * SMAPI interface
+ */
+
+/* SMAPI functions (register BX when making the SMM call). */
+#define SMAPI_GET_INHIBIT_CHARGE 0x2114
+#define SMAPI_SET_INHIBIT_CHARGE 0x2115
+#define SMAPI_GET_THRESH_START 0x2116
+#define SMAPI_SET_THRESH_START 0x2117
+#define SMAPI_GET_FORCE_DISCHARGE 0x2118
+#define SMAPI_SET_FORCE_DISCHARGE 0x2119
+#define SMAPI_GET_THRESH_STOP 0x211a
+#define SMAPI_SET_THRESH_STOP 0x211b
+
+/* SMAPI error codes (see ThinkPad 770 Technical Reference Manual p.83 at
+ http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD) */
+#define SMAPI_RETCODE_EOF 0xff
+static struct { u8 rc; char *msg; int ret; } smapi_retcode[] =
+{
+ {0x00, "OK", 0},
+ {0x53, "SMAPI function is not available", -ENXIO},
+ {0x81, "Invalid parameter", -EINVAL},
+ {0x86, "Function is not supported by SMAPI BIOS", -EOPNOTSUPP},
+ {0x90, "System error", -EIO},
+ {0x91, "System is invalid", -EIO},
+ {0x92, "System is busy", -EBUSY},
+ {0xa0, "Device error (disk read error)", -EIO},
+ {0xa1, "Device is busy", -EBUSY},
+ {0xa2, "Device is not attached", -ENXIO},
+ {0xa3, "Device is disabled", -EIO},
+ {0xa4, "Request parameter is out of range", -EINVAL},
+ {0xa5, "Request parameter is not accepted", -EINVAL},
+ {0xa6, "Transient error", -EBUSY}, /* ? */
+ {SMAPI_RETCODE_EOF, "Unknown error code", -EIO}
+};
+
+
+#define SMAPI_MAX_RETRIES 10
+#define SMAPI_PORT2 0x4F /* fixed port, meaning unclear */
+static unsigned short smapi_port; /* APM control port, normally 0xB2 */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+static DECLARE_MUTEX(smapi_mutex);
+#else
+static DEFINE_SEMAPHORE(smapi_mutex);
+#endif
+
+/**
+ * find_smapi_port - read SMAPI port from NVRAM
+ */
+static int __init find_smapi_port(void)
+{
+ u16 smapi_id = 0;
+ unsigned short port = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ smapi_id = CMOS_READ(0x7C);
+ smapi_id |= (CMOS_READ(0x7D) << 8);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ if (smapi_id != 0x5349) {
+ printk(KERN_ERR "SMAPI not supported (ID=0x%x)\n", smapi_id);
+ return -ENXIO;
+ }
+ spin_lock_irqsave(&rtc_lock, flags);
+ port = CMOS_READ(0x7E);
+ port |= (CMOS_READ(0x7F) << 8);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ if (port == 0) {
+ printk(KERN_ERR "unable to read SMAPI port number\n");
+ return -ENXIO;
+ }
+ return port;
+}
+
+/**
+ * smapi_request - make a SMAPI call
+ * @inEBX, @inECX, @inEDI, @inESI: input registers
+ * @outEBX, @outECX, @outEDX, @outEDI, @outESI: output registers
+ * @msg: textual error message
+ * Invokes the SMAPI SMBIOS with the given input and output args.
+ * All outputs are optional (can be %NULL).
+ * Returns 0 when successful, and a negative errno constant
+ * (see smapi_retcode above) upon failure.
+ */
+static int smapi_request(u32 inEBX, u32 inECX,
+ u32 inEDI, u32 inESI,
+ u32 *outEBX, u32 *outECX, u32 *outEDX,
+ u32 *outEDI, u32 *outESI, const char **msg)
+{
+ int ret = 0;
+ int i;
+ int retries;
+ u8 rc;
+ /* Must use local vars for output regs, due to reg pressure. */
+ u32 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI;
+
+ for (retries = 0; retries < SMAPI_MAX_RETRIES; ++retries) {
+ DPRINTK("req_in: BX=%x CX=%x DI=%x SI=%x",
+ inEBX, inECX, inEDI, inESI);
+
+ /* SMAPI's SMBIOS call and thinkpad_ec end up using
+ * different interfaces to the same chip, so play it safe.
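
find_smapi_port() above recognizes SMAPI by the word assembled from CMOS/NVRAM bytes 0x7C (low) and 0x7D (high): 0x5349 is simply the ASCII pair 'I' (0x49) and 'S' (0x53). The port itself comes from bytes 0x7E/0x7F and is normally the APM control port 0xB2. A small user-space sketch of the same decoding, with example byte values:

/* Illustrative only: how the SMAPI signature and port are assembled
 * from CMOS bytes 0x7C-0x7F; the byte values here are examples. */
#include <stdio.h>

int main(void)
{
        unsigned char nvram_7c = 0x49, nvram_7d = 0x53;  /* 'I', 'S' */
        unsigned char nvram_7e = 0xB2, nvram_7f = 0x00;  /* example port */

        unsigned int id   = nvram_7c | (nvram_7d << 8);
        unsigned int port = nvram_7e | (nvram_7f << 8);

        printf("id=0x%04X port=0x%02X\n", id, port);  /* id=0x5349 port=0xB2 */
        return 0;
}
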
*/ + ret = thinkpad_ec_lock(); + if (ret) + return ret; + + __asm__ __volatile__( + "movl $0x00005380,%%eax\n\t" + "movl %6,%%ebx\n\t" + "movl %7,%%ecx\n\t" + "movl %8,%%edi\n\t" + "movl %9,%%esi\n\t" + "xorl %%edx,%%edx\n\t" + "movw %10,%%dx\n\t" + "out %%al,%%dx\n\t" /* trigger SMI to SMBIOS */ + "out %%al,$0x4F\n\t" + "movl %%eax,%0\n\t" + "movl %%ebx,%1\n\t" + "movl %%ecx,%2\n\t" + "movl %%edx,%3\n\t" + "movl %%edi,%4\n\t" + "movl %%esi,%5\n\t" + :"=m"(tmpEAX), + "=m"(tmpEBX), + "=m"(tmpECX), + "=m"(tmpEDX), + "=m"(tmpEDI), + "=m"(tmpESI) + :"m"(inEBX), "m"(inECX), "m"(inEDI), "m"(inESI), + "m"((u16)smapi_port) + :"%eax", "%ebx", "%ecx", "%edx", "%edi", + "%esi"); + + thinkpad_ec_invalidate(); + thinkpad_ec_unlock(); + + /* Don't let the next SMAPI access happen too quickly, + * may case problems. (We're hold smapi_mutex). */ + msleep(50); + + if (outEBX) *outEBX = tmpEBX; + if (outECX) *outECX = tmpECX; + if (outEDX) *outEDX = tmpEDX; + if (outESI) *outESI = tmpESI; + if (outEDI) *outEDI = tmpEDI; + + /* Look up error code */ + rc = (tmpEAX>>8)&0xFF; + for (i = 0; smapi_retcode[i].rc != SMAPI_RETCODE_EOF && + smapi_retcode[i].rc != rc; ++i) {} + ret = smapi_retcode[i].ret; + if (msg) + *msg = smapi_retcode[i].msg; + + DPRINTK("req_out: AX=%x BX=%x CX=%x DX=%x DI=%x SI=%x r=%d", + tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI, ret); + if (ret) + TPRINTK(KERN_NOTICE, "SMAPI error: %s (func=%x)", + smapi_retcode[i].msg, inEBX); + + if (ret != -EBUSY) + return ret; + } + return ret; +} + +/* Convenience wrapper: discard output arguments */ +static int smapi_write(u32 inEBX, u32 inECX, + u32 inEDI, u32 inESI, const char **msg) +{ + return smapi_request(inEBX, inECX, inEDI, inESI, + NULL, NULL, NULL, NULL, NULL, msg); +} + + +/********************************************************************* + * Specific SMAPI services + * All of these functions return 0 upon success, and a negative errno + * constant (see smapi_retcode) on failure. + */ + +enum thresh_type { + THRESH_STOP = 0, /* the code assumes this is 0 for brevity */ + THRESH_START +}; +#define THRESH_NAME(which) ((which == THRESH_START) ? "start" : "stop") + +/** + * __get_real_thresh - read battery charge start/stop threshold from SMAPI + * @bat: battery number (0 or 1) + * @which: THRESH_START or THRESH_STOP + * @thresh: 1..99, 0=default 1..99, 0=default (pass this as-is to SMAPI) + * @outEDI: some additional state that needs to be preserved, meaning unknown + * @outESI: some additional state that needs to be preserved, meaning unknown + */ +static int __get_real_thresh(int bat, enum thresh_type which, int *thresh, + u32 *outEDI, u32 *outESI) +{ + u32 ebx = (which == THRESH_START) ? 
SMAPI_GET_THRESH_START + : SMAPI_GET_THRESH_STOP; + u32 ecx = (bat+1)<<8; + const char *msg; + int ret = smapi_request(ebx, ecx, 0, 0, NULL, + &ecx, NULL, outEDI, outESI, &msg); + if (ret) { + TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: %s", + THRESH_NAME(which), bat, msg); + return ret; + } + if (!(ecx&0x00000100)) { + TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: ecx=0%x", + THRESH_NAME(which), bat, ecx); + return -EIO; + } + if (thresh) + *thresh = ecx&0xFF; + return 0; +} + +/** + * get_real_thresh - read battery charge start/stop threshold from SMAPI + * @bat: battery number (0 or 1) + * @which: THRESH_START or THRESH_STOP + * @thresh: 1..99, 0=default (passes as-is to SMAPI) + */ +static int get_real_thresh(int bat, enum thresh_type which, int *thresh) +{ + return __get_real_thresh(bat, which, thresh, NULL, NULL); +} + +/** + * set_real_thresh - write battery start/top charge threshold to SMAPI + * @bat: battery number (0 or 1) + * @which: THRESH_START or THRESH_STOP + * @thresh: 1..99, 0=default (passes as-is to SMAPI) + */ +static int set_real_thresh(int bat, enum thresh_type which, int thresh) +{ + u32 ebx = (which == THRESH_START) ? SMAPI_SET_THRESH_START + : SMAPI_SET_THRESH_STOP; + u32 ecx = ((bat+1)<<8) + thresh; + u32 getDI, getSI; + const char *msg; + int ret; + + /* verify read before writing */ + ret = __get_real_thresh(bat, which, NULL, &getDI, &getSI); + if (ret) + return ret; + + ret = smapi_write(ebx, ecx, getDI, getSI, &msg); + if (ret) + TPRINTK(KERN_NOTICE, "set %s to %d for bat=%d failed: %s", + THRESH_NAME(which), thresh, bat, msg); + else + TPRINTK(KERN_INFO, "set %s to %d for bat=%d", + THRESH_NAME(which), thresh, bat); + return ret; +} + +/** + * __get_inhibit_charge_minutes - get inhibit charge period from SMAPI + * @bat: battery number (0 or 1) + * @minutes: period in minutes (1..65535 minutes, 0=disabled) + * @outECX: some additional state that needs to be preserved, meaning unknown + * Note that @minutes is the originally set value, it does not count down. + */ +static int __get_inhibit_charge_minutes(int bat, int *minutes, u32 *outECX) +{ + u32 ecx = (bat+1)<<8; + u32 esi; + const char *msg; + int ret = smapi_request(SMAPI_GET_INHIBIT_CHARGE, ecx, 0, 0, + NULL, &ecx, NULL, NULL, &esi, &msg); + if (ret) { + TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); + return ret; + } + if (!(ecx&0x0100)) { + TPRINTK(KERN_NOTICE, "bad ecx=0x%x for bat=%d", ecx, bat); + return -EIO; + } + if (minutes) + *minutes = (ecx&0x0001)?esi:0; + if (outECX) + *outECX = ecx; + return 0; +} + +/** + * get_inhibit_charge_minutes - get inhibit charge period from SMAPI + * @bat: battery number (0 or 1) + * @minutes: period in minutes (1..65535 minutes, 0=disabled) + * Note that @minutes is the originally set value, it does not count down. + */ +static int get_inhibit_charge_minutes(int bat, int *minutes) +{ + return __get_inhibit_charge_minutes(bat, minutes, NULL); +} + +/** + * set_inhibit_charge_minutes - write inhibit charge period to SMAPI + * @bat: battery number (0 or 1) + * @minutes: period in minutes (1..65535 minutes, 0=disabled) + */ +static int set_inhibit_charge_minutes(int bat, int minutes) +{ + u32 ecx; + const char *msg; + int ret; + + /* verify read before writing */ + ret = __get_inhibit_charge_minutes(bat, NULL, &ecx); + if (ret) + return ret; + + ecx = ((bat+1)<<8) | (ecx&0x00FE) | (minutes > 0 ? 
0x0001 : 0x0000); + if (minutes > 0xFFFF) + minutes = 0xFFFF; + ret = smapi_write(SMAPI_SET_INHIBIT_CHARGE, ecx, 0, minutes, &msg); + if (ret) + TPRINTK(KERN_NOTICE, + "set to %d failed for bat=%d: %s", minutes, bat, msg); + else + TPRINTK(KERN_INFO, "set to %d for bat=%d\n", minutes, bat); + return ret; +} + + +/** + * get_force_discharge - get status of forced discharging from SMAPI + * @bat: battery number (0 or 1) + * @enabled: 1 if forced discharged is enabled, 0 if not + */ +static int get_force_discharge(int bat, int *enabled) +{ + u32 ecx = (bat+1)<<8; + const char *msg; + int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, + NULL, &ecx, NULL, NULL, NULL, &msg); + if (ret) { + TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); + return ret; + } + *enabled = (!(ecx&0x00000100) && (ecx&0x00000001))?1:0; + return 0; +} + +/** + * set_force_discharge - write status of forced discharging to SMAPI + * @bat: battery number (0 or 1) + * @enabled: 1 if forced discharged is enabled, 0 if not + */ +static int set_force_discharge(int bat, int enabled) +{ + u32 ecx = (bat+1)<<8; + const char *msg; + int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, + NULL, &ecx, NULL, NULL, NULL, &msg); + if (ret) { + TPRINTK(KERN_NOTICE, "get failed for bat=%d: %s", bat, msg); + return ret; + } + if (ecx&0x00000100) { + TPRINTK(KERN_NOTICE, "cannot force discharge bat=%d", bat); + return -EIO; + } + + ecx = ((bat+1)<<8) | (ecx&0x000000FA) | (enabled?0x00000001:0); + ret = smapi_write(SMAPI_SET_FORCE_DISCHARGE, ecx, 0, 0, &msg); + if (ret) + TPRINTK(KERN_NOTICE, "set to %d failed for bat=%d: %s", + enabled, bat, msg); + else + TPRINTK(KERN_INFO, "set to %d for bat=%d", enabled, bat); + return ret; +} + + +/********************************************************************* + * Wrappers to threshold-related SMAPI functions, which handle default + * thresholds and related quirks. + */ + +/* Minimum, default and minimum difference for battery charging thresholds: */ +#define MIN_THRESH_DELTA 4 /* Min delta between start and stop thresh */ +#define MIN_THRESH_START 2 +#define MAX_THRESH_START (100-MIN_THRESH_DELTA) +#define MIN_THRESH_STOP (MIN_THRESH_START + MIN_THRESH_DELTA) +#define MAX_THRESH_STOP 100 +#define DEFAULT_THRESH_START MAX_THRESH_START +#define DEFAULT_THRESH_STOP MAX_THRESH_STOP + +/* The GUI of IBM's Battery Maximizer seems to show a start threshold that + * is 1 more than the value we set/get via SMAPI. Since the threshold is + * maintained across reboot, this can be confusing. So we kludge our + * interface for interoperability: */ +#define BATMAX_FIX 1 + +/* Get charge start/stop threshold (1..100), + * substituting default values if needed and applying BATMAT_FIX. */ +static int get_thresh(int bat, enum thresh_type which, int *thresh) +{ + int ret = get_real_thresh(bat, which, thresh); + if (ret) + return ret; + if (*thresh == 0) + *thresh = (which == THRESH_START) ? DEFAULT_THRESH_START + : DEFAULT_THRESH_STOP; + else if (which == THRESH_START) + *thresh += BATMAX_FIX; + return 0; +} + + +/* Set charge start/stop threshold (1..100), + * substituting default values if needed and applying BATMAT_FIX. 
*/ +static int set_thresh(int bat, enum thresh_type which, int thresh) +{ + if (which == THRESH_STOP && thresh == DEFAULT_THRESH_STOP) + thresh = 0; /* 100 is out of range, but default means 100 */ + if (which == THRESH_START) + thresh -= BATMAX_FIX; + return set_real_thresh(bat, which, thresh); +} + +/********************************************************************* + * ThinkPad embedded controller readout and basic functions + */ + +/** + * read_tp_ec_row - read data row from the ThinkPad embedded controller + * @arg0: EC command code + * @bat: battery number, 0 or 1 + * @j: the byte value to be used for "junk" (unused) input/outputs + * @dataval: result vector + */ +static int read_tp_ec_row(u8 arg0, int bat, u8 j, u8 *dataval) +{ + int ret; + const struct thinkpad_ec_row args = { .mask = 0xFFFF, + .val = {arg0, j,j,j,j,j,j,j,j,j,j,j,j,j,j, (u8)bat} }; + struct thinkpad_ec_row data = { .mask = 0xFFFF }; + + ret = thinkpad_ec_lock(); + if (ret) + return ret; + ret = thinkpad_ec_read_row(&args, &data); + thinkpad_ec_unlock(); + memcpy(dataval, &data.val, TP_CONTROLLER_ROW_LEN); + return ret; +} + +/** + * power_device_present - check for presence of battery or AC power + * @bat: 0 for battery 0, 1 for battery 1, otherwise AC power + * Returns 1 if present, 0 if not present, negative if error. + */ +static int power_device_present(int bat) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + u8 test; + int ret = read_tp_ec_row(1, bat, 0, row); + if (ret) + return ret; + switch (bat) { + case 0: test = 0x40; break; /* battery 0 */ + case 1: test = 0x20; break; /* battery 1 */ + default: test = 0x80; /* AC power */ + } + return (row[0] & test) ? 1 : 0; +} + +/** + * bat_has_status - check if battery can report detailed status + * @bat: 0 for battery 0, 1 for battery 1 + * Returns 1 if yes, 0 if no, negative if error. + */ +static int bat_has_status(int bat) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + int ret = read_tp_ec_row(1, bat, 0, row); + if (ret) + return ret; + if ((row[0] & (bat?0x20:0x40)) == 0) /* no battery */ + return 0; + if ((row[1] & (0x60)) == 0) /* no status */ + return 0; + return 1; +} + +/** + * get_tp_ec_bat_16 - read a 16-bit value from EC battery status data + * @arg0: first argument to EC + * @off: offset in row returned from EC + * @bat: battery (0 or 1) + * @val: the 16-bit value obtained + * Returns nonzero on error. 
+ */ +static int get_tp_ec_bat_16(u8 arg0, int offset, int bat, u16 *val) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + int ret; + if (bat_has_status(bat) != 1) + return -ENXIO; + ret = read_tp_ec_row(arg0, bat, 0, row); + if (ret) + return ret; + *val = *(u16 *)(row+offset); + return 0; +} + +/********************************************************************* + * sysfs attributes for batteries - + * definitions and helper functions + */ + +/* A custom device attribute struct which holds a battery number */ +struct bat_device_attribute { + struct device_attribute dev_attr; + int bat; +}; + +/** + * attr_get_bat - get the battery to which the attribute belongs + */ +static int attr_get_bat(struct device_attribute *attr) +{ + return container_of(attr, struct bat_device_attribute, dev_attr)->bat; +} + +/** + * show_tp_ec_bat_u16 - show an unsigned 16-bit battery attribute + * @arg0: specified 1st argument of EC raw to read + * @offset: byte offset in EC raw data + * @mul: correction factor to multiply by + * @na_msg: string to output if value not available (0xFFFF) + * @attr: battery attribute + * @buf: output buffer + * The 16-bit value is read from the EC, treated as unsigned, + * transformed as x->mul*x, and printed to the buffer. + * If the value is 0xFFFF and na_msg!=%NULL, na_msg is printed instead. + */ +static ssize_t show_tp_ec_bat_u16(u8 arg0, int offset, int mul, + const char *na_msg, + struct device_attribute *attr, char *buf) +{ + u16 val; + int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); + if (ret) + return ret; + if (na_msg && val == 0xFFFF) + return sprintf(buf, "%s\n", na_msg); + else + return sprintf(buf, "%u\n", mul*(unsigned int)val); +} + +/** + * show_tp_ec_bat_s16 - show a signed 16-bit battery attribute + * @arg0: specified 1st argument of EC raw to read + * @offset: byte offset in EC raw data + * @mul: correction factor to multiply by + * @add: correction term to add after multiplication + * @attr: battery attribute + * @buf: output buffer + * The 16-bit value is read from the EC, treated as signed, + * transformed as x->mul*x+add, and printed to the buffer. + */ +static ssize_t show_tp_ec_bat_s16(u8 arg0, int offset, int mul, int add, + struct device_attribute *attr, char *buf) +{ + u16 val; + int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); + if (ret) + return ret; + return sprintf(buf, "%d\n", mul*(s16)val+add); +} + +/** + * show_tp_ec_bat_str - show a string from EC battery status data + * @arg0: specified 1st argument of EC raw to read + * @offset: byte offset in EC raw data + * @maxlen: maximum string length + * @attr: battery attribute + * @buf: output buffer + */ +static ssize_t show_tp_ec_bat_str(u8 arg0, int offset, int maxlen, + struct device_attribute *attr, char *buf) +{ + int bat = attr_get_bat(attr); + u8 row[TP_CONTROLLER_ROW_LEN]; + int ret; + if (bat_has_status(bat) != 1) + return -ENXIO; + ret = read_tp_ec_row(arg0, bat, 0, row); + if (ret) + return ret; + strncpy(buf, (char *)row+offset, maxlen); + buf[maxlen] = 0; + strcat(buf, "\n"); + return strlen(buf); +} + +/** + * show_tp_ec_bat_power - show a power readout from EC battery status data + * @arg0: specified 1st argument of EC raw to read + * @offV: byte offset of voltage in EC raw data + * @offI: byte offset of current in EC raw data + * @attr: battery attribute + * @buf: output buffer + * Computes the power as current*voltage from the two given readout offsets. 
+ */ +static ssize_t show_tp_ec_bat_power(u8 arg0, int offV, int offI, + struct device_attribute *attr, char *buf) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + int milliamp, millivolt, ret; + int bat = attr_get_bat(attr); + if (bat_has_status(bat) != 1) + return -ENXIO; + ret = read_tp_ec_row(1, bat, 0, row); + if (ret) + return ret; + millivolt = *(u16 *)(row+offV); + milliamp = *(s16 *)(row+offI); + return sprintf(buf, "%d\n", milliamp*millivolt/1000); /* units: mW */ +} + +/** + * show_tp_ec_bat_date - decode and show a date from EC battery status data + * @arg0: specified 1st argument of EC raw to read + * @offset: byte offset in EC raw data + * @attr: battery attribute + * @buf: output buffer + */ +static ssize_t show_tp_ec_bat_date(u8 arg0, int offset, + struct device_attribute *attr, char *buf) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + u16 v; + int ret; + int day, month, year; + int bat = attr_get_bat(attr); + if (bat_has_status(bat) != 1) + return -ENXIO; + ret = read_tp_ec_row(arg0, bat, 0, row); + if (ret) + return ret; + + /* Decode bit-packed: v = day | (month<<5) | ((year-1980)<<9) */ + v = *(u16 *)(row+offset); + day = v & 0x1F; + month = (v >> 5) & 0xF; + year = (v >> 9) + 1980; + + return sprintf(buf, "%04d-%02d-%02d\n", year, month, day); +} + + +/********************************************************************* + * sysfs attribute I/O for batteries - + * the actual attribute show/store functions + */ + +static ssize_t show_battery_start_charge_thresh(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int thresh; + int bat = attr_get_bat(attr); + int ret = get_thresh(bat, THRESH_START, &thresh); + if (ret) + return ret; + return sprintf(buf, "%d\n", thresh); /* units: percent */ +} + +static ssize_t show_battery_stop_charge_thresh(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int thresh; + int bat = attr_get_bat(attr); + int ret = get_thresh(bat, THRESH_STOP, &thresh); + if (ret) + return ret; + return sprintf(buf, "%d\n", thresh); /* units: percent */ +} + +/** + * store_battery_start_charge_thresh - store battery_start_charge_thresh attr + * Since this is a kernel<->user interface, we ensure a valid state for + * the hardware. We do this by clamping the requested threshold to the + * valid range and, if necessary, moving the other threshold so that + * it's MIN_THRESH_DELTA away from this one. + */ +static ssize_t store_battery_start_charge_thresh(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int thresh, other_thresh, ret; + int bat = attr_get_bat(attr); + + if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) + return -EINVAL; + + if (thresh < MIN_THRESH_START) /* clamp up to MIN_THRESH_START */ + thresh = MIN_THRESH_START; + if (thresh > MAX_THRESH_START) /* clamp down to MAX_THRESH_START */ + thresh = MAX_THRESH_START; + + down(&smapi_mutex); + ret = get_thresh(bat, THRESH_STOP, &other_thresh); + if (ret != -EOPNOTSUPP && ret != -ENXIO) { + if (ret) /* other threshold is set? */ + goto out; + ret = get_real_thresh(bat, THRESH_START, NULL); + if (ret) /* this threshold is set? 
*/ + goto out; + if (other_thresh < thresh+MIN_THRESH_DELTA) { + /* move other thresh to keep it above this one */ + ret = set_thresh(bat, THRESH_STOP, + thresh+MIN_THRESH_DELTA); + if (ret) + goto out; + } + } + ret = set_thresh(bat, THRESH_START, thresh); +out: + up(&smapi_mutex); + return count; + +} + +/** + * store_battery_stop_charge_thresh - store battery_stop_charge_thresh attr + * Since this is a kernel<->user interface, we ensure a valid state for + * the hardware. We do this by clamping the requested threshold to the + * valid range and, if necessary, moving the other threshold so that + * it's MIN_THRESH_DELTA away from this one. + */ +static ssize_t store_battery_stop_charge_thresh(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int thresh, other_thresh, ret; + int bat = attr_get_bat(attr); + + if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) + return -EINVAL; + + if (thresh < MIN_THRESH_STOP) /* clamp up to MIN_THRESH_STOP */ + thresh = MIN_THRESH_STOP; + + down(&smapi_mutex); + ret = get_thresh(bat, THRESH_START, &other_thresh); + if (ret != -EOPNOTSUPP && ret != -ENXIO) { /* other threshold exists? */ + if (ret) + goto out; + /* this threshold exists? */ + ret = get_real_thresh(bat, THRESH_STOP, NULL); + if (ret) + goto out; + if (other_thresh >= thresh-MIN_THRESH_DELTA) { + /* move other thresh to be below this one */ + ret = set_thresh(bat, THRESH_START, + thresh-MIN_THRESH_DELTA); + if (ret) + goto out; + } + } + ret = set_thresh(bat, THRESH_STOP, thresh); +out: + up(&smapi_mutex); + return count; +} + +static ssize_t show_battery_inhibit_charge_minutes(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int minutes; + int bat = attr_get_bat(attr); + int ret = get_inhibit_charge_minutes(bat, &minutes); + if (ret) + return ret; + return sprintf(buf, "%d\n", minutes); /* units: minutes */ +} + +static ssize_t store_battery_inhibit_charge_minutes(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + int minutes; + int bat = attr_get_bat(attr); + if (sscanf(buf, "%d", &minutes) != 1 || minutes < 0) { + TPRINTK(KERN_ERR, "inhibit_charge_minutes: " + "must be a non-negative integer"); + return -EINVAL; + } + ret = set_inhibit_charge_minutes(bat, minutes); + if (ret) + return ret; + return count; +} + +static ssize_t show_battery_force_discharge(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int enabled; + int bat = attr_get_bat(attr); + int ret = get_force_discharge(bat, &enabled); + if (ret) + return ret; + return sprintf(buf, "%d\n", enabled); /* type: boolean */ +} + +static ssize_t store_battery_force_discharge(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int ret; + int enabled; + int bat = attr_get_bat(attr); + if (sscanf(buf, "%d", &enabled) != 1 || enabled < 0 || enabled > 1) + return -EINVAL; + ret = set_force_discharge(bat, enabled); + if (ret) + return ret; + return count; +} + +static ssize_t show_battery_installed( + struct device *dev, struct device_attribute *attr, char *buf) +{ + int bat = attr_get_bat(attr); + int ret = power_device_present(bat); + if (ret < 0) + return ret; + return sprintf(buf, "%d\n", ret); /* type: boolean */ +} + +static ssize_t show_battery_state( + struct device *dev, struct device_attribute *attr, char *buf) +{ + u8 row[TP_CONTROLLER_ROW_LEN]; + const char *txt; + int ret; + int bat = attr_get_bat(attr); + if (bat_has_status(bat) != 1) + return 
sprintf(buf, "none\n"); + ret = read_tp_ec_row(1, bat, 0, row); + if (ret) + return ret; + switch (row[1] & 0xf0) { + case 0xc0: txt = "idle"; break; + case 0xd0: txt = "discharging"; break; + case 0xe0: txt = "charging"; break; + default: return sprintf(buf, "unknown (0x%x)\n", row[1]); + } + return sprintf(buf, "%s\n", txt); /* type: string from fixed set */ +} + +static ssize_t show_battery_manufacturer( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: string. SBS spec v1.1 p34: ManufacturerName() */ + return show_tp_ec_bat_str(4, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); +} + +static ssize_t show_battery_model( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: string. SBS spec v1.1 p34: DeviceName() */ + return show_tp_ec_bat_str(5, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); +} + +static ssize_t show_battery_barcoding( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: string */ + return show_tp_ec_bat_str(7, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); +} + +static ssize_t show_battery_chemistry( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: string. SBS spec v1.1 p34-35: DeviceChemistry() */ + return show_tp_ec_bat_str(6, 2, 5, attr, buf); +} + +static ssize_t show_battery_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV. SBS spec v1.1 p24: Voltage() */ + return show_tp_ec_bat_u16(1, 6, 1, NULL, attr, buf); +} + +static ssize_t show_battery_design_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV. SBS spec v1.1 p32: DesignVoltage() */ + return show_tp_ec_bat_u16(3, 4, 1, NULL, attr, buf); +} + +static ssize_t show_battery_charging_max_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV. SBS spec v1.1 p37,39: ChargingVoltage() */ + return show_tp_ec_bat_u16(9, 8, 1, NULL, attr, buf); +} + +static ssize_t show_battery_group0_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV */ + return show_tp_ec_bat_u16(0xA, 12, 1, NULL, attr, buf); +} + +static ssize_t show_battery_group1_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV */ + return show_tp_ec_bat_u16(0xA, 10, 1, NULL, attr, buf); +} + +static ssize_t show_battery_group2_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV */ + return show_tp_ec_bat_u16(0xA, 8, 1, NULL, attr, buf); +} + +static ssize_t show_battery_group3_voltage( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mV */ + return show_tp_ec_bat_u16(0xA, 6, 1, NULL, attr, buf); +} + +static ssize_t show_battery_current_now( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mA. SBS spec v1.1 p24: Current() */ + return show_tp_ec_bat_s16(1, 8, 1, 0, attr, buf); +} + +static ssize_t show_battery_current_avg( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mA. SBS spec v1.1 p24: AverageCurrent() */ + return show_tp_ec_bat_s16(1, 10, 1, 0, attr, buf); +} + +static ssize_t show_battery_charging_max_current( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mA. SBS spec v1.1 p36,38: ChargingCurrent() */ + return show_tp_ec_bat_s16(9, 6, 1, 0, attr, buf); +} + +static ssize_t show_battery_power_now( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mW. 
SBS spec v1.1: Voltage()*Current() */ + return show_tp_ec_bat_power(1, 6, 8, attr, buf); +} + +static ssize_t show_battery_power_avg( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mW. SBS spec v1.1: Voltage()*AverageCurrent() */ + return show_tp_ec_bat_power(1, 6, 10, attr, buf); +} + +static ssize_t show_battery_remaining_percent( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: percent. SBS spec v1.1 p25: RelativeStateOfCharge() */ + return show_tp_ec_bat_u16(1, 12, 1, NULL, attr, buf); +} + +static ssize_t show_battery_remaining_percent_error( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: percent. SBS spec v1.1 p25: MaxError() */ + return show_tp_ec_bat_u16(9, 4, 1, NULL, attr, buf); +} + +static ssize_t show_battery_remaining_charging_time( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: minutes. SBS spec v1.1 p27: AverageTimeToFull() */ + return show_tp_ec_bat_u16(2, 8, 1, "not_charging", attr, buf); +} + +static ssize_t show_battery_remaining_running_time( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ + return show_tp_ec_bat_u16(2, 6, 1, "not_discharging", attr, buf); +} + +static ssize_t show_battery_remaining_running_time_now( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ + return show_tp_ec_bat_u16(2, 4, 1, "not_discharging", attr, buf); +} + +static ssize_t show_battery_remaining_capacity( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mWh. SBS spec v1.1 p26. */ + return show_tp_ec_bat_u16(1, 14, 10, "", attr, buf); +} + +static ssize_t show_battery_last_full_capacity( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mWh. SBS spec v1.1 p26: FullChargeCapacity() */ + return show_tp_ec_bat_u16(2, 2, 10, "", attr, buf); +} + +static ssize_t show_battery_design_capacity( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: mWh. SBS spec v1.1 p32: DesignCapacity() */ + return show_tp_ec_bat_u16(3, 2, 10, "", attr, buf); +} + +static ssize_t show_battery_cycle_count( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: ordinal. SBS spec v1.1 p32: CycleCount() */ + return show_tp_ec_bat_u16(2, 12, 1, "", attr, buf); +} + +static ssize_t show_battery_temperature( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* units: millicelsius. SBS spec v1.1: Temperature()*10 */ + return show_tp_ec_bat_s16(1, 4, 100, -273100, attr, buf); +} + +static ssize_t show_battery_serial( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: int. SBS spec v1.1 p34: SerialNumber() */ + return show_tp_ec_bat_u16(3, 10, 1, "", attr, buf); +} + +static ssize_t show_battery_manufacture_date( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: YYYY-MM-DD. SBS spec v1.1 p34: ManufactureDate() */ + return show_tp_ec_bat_date(3, 8, attr, buf); +} + +static ssize_t show_battery_first_use_date( + struct device *dev, struct device_attribute *attr, char *buf) +{ + /* type: YYYY-MM-DD */ + return show_tp_ec_bat_date(8, 2, attr, buf); +} + +/** + * show_battery_dump - show the battery's dump attribute + * The dump attribute gives a hex dump of all EC readouts related to a + * battery. 
Some of the enumerated values don't really exist (i.e., the + * EC function just leaves them untouched); we use a kludge to detect and + * denote these. + */ +#define MIN_DUMP_ARG0 0x00 +#define MAX_DUMP_ARG0 0x0a /* 0x0b is useful too but hangs old EC firmware */ +static ssize_t show_battery_dump( + struct device *dev, struct device_attribute *attr, char *buf) +{ + int i; + char *p = buf; + int bat = attr_get_bat(attr); + u8 arg0; /* first argument to EC */ + u8 rowa[TP_CONTROLLER_ROW_LEN], + rowb[TP_CONTROLLER_ROW_LEN]; + const u8 junka = 0xAA, + junkb = 0x55; /* junk values for testing changes */ + int ret; + + for (arg0 = MIN_DUMP_ARG0; arg0 <= MAX_DUMP_ARG0; ++arg0) { + if ((p-buf) > PAGE_SIZE-TP_CONTROLLER_ROW_LEN*5) + return -ENOMEM; /* don't overflow sysfs buf */ + /* Read raw twice with different junk values, + * to detect unused output bytes which are left unchanged: */ + ret = read_tp_ec_row(arg0, bat, junka, rowa); + if (ret) + return ret; + ret = read_tp_ec_row(arg0, bat, junkb, rowb); + if (ret) + return ret; + for (i = 0; i < TP_CONTROLLER_ROW_LEN; i++) { + if (rowa[i] == junka && rowb[i] == junkb) + p += sprintf(p, "-- "); /* unused by EC */ + else + p += sprintf(p, "%02x ", rowa[i]); + } + p += sprintf(p, "\n"); + } + return p-buf; +} + + +/********************************************************************* + * sysfs attribute I/O, other than batteries + */ + +static ssize_t show_ac_connected( + struct device *dev, struct device_attribute *attr, char *buf) +{ + int ret = power_device_present(0xFF); + if (ret < 0) + return ret; + return sprintf(buf, "%d\n", ret); /* type: boolean */ +} + +/********************************************************************* + * The "smapi_request" sysfs attribute executes a raw SMAPI call. + * You write to make a request and read to get the result. The state + * is saved globally rather than per fd (sysfs limitation), so + * simultaneous requests may get each other's results! So this is for + * development and debugging only. + */ +#define MAX_SMAPI_ATTR_ANSWER_LEN 128 +static char smapi_attr_answer[MAX_SMAPI_ATTR_ANSWER_LEN] = ""; + +static ssize_t show_smapi_request(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int ret = snprintf(buf, PAGE_SIZE, "%s", smapi_attr_answer); + smapi_attr_answer[0] = '\0'; + return ret; +} + +static ssize_t store_smapi_request(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int inEBX, inECX, inEDI, inESI; + u32 outEBX, outECX, outEDX, outEDI, outESI; + const char *msg; + int ret; + if (sscanf(buf, "%x %x %x %x", &inEBX, &inECX, &inEDI, &inESI) != 4) { + smapi_attr_answer[0] = '\0'; + return -EINVAL; + } + ret = smapi_request( + inEBX, inECX, inEDI, inESI, + &outEBX, &outECX, &outEDX, &outEDI, &outESI, &msg); + snprintf(smapi_attr_answer, MAX_SMAPI_ATTR_ANSWER_LEN, + "%x %x %x %x %x %d '%s'\n", + (unsigned int)outEBX, (unsigned int)outECX, + (unsigned int)outEDX, (unsigned int)outEDI, + (unsigned int)outESI, ret, msg); + if (ret) + return ret; + else + return count; +} + +/********************************************************************* + * Power management: the embedded controller forgets the battery + * thresholds when the system is suspended to disk and unplugged from + * AC and battery, so we restore them upon resume. 
+ */ + +static int saved_threshs[4] = {-1, -1, -1, -1}; /* -1 = don't know */ + +static int tp_suspend(struct platform_device *dev, pm_message_t state) +{ + int restore = (state.event == PM_EVENT_HIBERNATE || + state.event == PM_EVENT_FREEZE); + if (!restore || get_real_thresh(0, THRESH_STOP , &saved_threshs[0])) + saved_threshs[0] = -1; + if (!restore || get_real_thresh(0, THRESH_START, &saved_threshs[1])) + saved_threshs[1] = -1; + if (!restore || get_real_thresh(1, THRESH_STOP , &saved_threshs[2])) + saved_threshs[2] = -1; + if (!restore || get_real_thresh(1, THRESH_START, &saved_threshs[3])) + saved_threshs[3] = -1; + DPRINTK("suspend saved: %d %d %d %d", saved_threshs[0], + saved_threshs[1], saved_threshs[2], saved_threshs[3]); + return 0; +} + +static int tp_resume(struct platform_device *dev) +{ + DPRINTK("resume restoring: %d %d %d %d", saved_threshs[0], + saved_threshs[1], saved_threshs[2], saved_threshs[3]); + if (saved_threshs[0] >= 0) + set_real_thresh(0, THRESH_STOP , saved_threshs[0]); + if (saved_threshs[1] >= 0) + set_real_thresh(0, THRESH_START, saved_threshs[1]); + if (saved_threshs[2] >= 0) + set_real_thresh(1, THRESH_STOP , saved_threshs[2]); + if (saved_threshs[3] >= 0) + set_real_thresh(1, THRESH_START, saved_threshs[3]); + return 0; +} + + +/********************************************************************* + * Driver model + */ + +static struct platform_driver tp_driver = { + .suspend = tp_suspend, + .resume = tp_resume, + .driver = { + .name = "smapi", + .owner = THIS_MODULE + }, +}; + + +/********************************************************************* + * Sysfs device model + */ + +/* Attributes in /sys/devices/platform/smapi/ */ + +static DEVICE_ATTR(ac_connected, 0444, show_ac_connected, NULL); +static DEVICE_ATTR(smapi_request, 0600, show_smapi_request, + store_smapi_request); + +static struct attribute *tp_root_attributes[] = { + &dev_attr_ac_connected.attr, + &dev_attr_smapi_request.attr, + NULL +}; +static struct attribute_group tp_root_attribute_group = { + .attrs = tp_root_attributes +}; + +/* Attributes under /sys/devices/platform/smapi/BAT{0,1}/ : + * Every attribute needs to be defined (i.e., statically allocated) for + * each battery, and then referenced in the attribute list of each battery. + * We use preprocessor voodoo to avoid duplicating the list of attributes 4 + * times. The preprocessor output is just normal sysfs attributes code. 
+ */ + +/** + * FOREACH_BAT_ATTR - invoke the given macros on all our battery attributes + * @_BAT: battery number (0 or 1) + * @_ATTR_RW: macro to invoke for each read/write attribute + * @_ATTR_R: macro to invoke for each read-only attribute + */ +#define FOREACH_BAT_ATTR(_BAT, _ATTR_RW, _ATTR_R) \ + _ATTR_RW(_BAT, start_charge_thresh) \ + _ATTR_RW(_BAT, stop_charge_thresh) \ + _ATTR_RW(_BAT, inhibit_charge_minutes) \ + _ATTR_RW(_BAT, force_discharge) \ + _ATTR_R(_BAT, installed) \ + _ATTR_R(_BAT, state) \ + _ATTR_R(_BAT, manufacturer) \ + _ATTR_R(_BAT, model) \ + _ATTR_R(_BAT, barcoding) \ + _ATTR_R(_BAT, chemistry) \ + _ATTR_R(_BAT, voltage) \ + _ATTR_R(_BAT, group0_voltage) \ + _ATTR_R(_BAT, group1_voltage) \ + _ATTR_R(_BAT, group2_voltage) \ + _ATTR_R(_BAT, group3_voltage) \ + _ATTR_R(_BAT, current_now) \ + _ATTR_R(_BAT, current_avg) \ + _ATTR_R(_BAT, charging_max_current) \ + _ATTR_R(_BAT, power_now) \ + _ATTR_R(_BAT, power_avg) \ + _ATTR_R(_BAT, remaining_percent) \ + _ATTR_R(_BAT, remaining_percent_error) \ + _ATTR_R(_BAT, remaining_charging_time) \ + _ATTR_R(_BAT, remaining_running_time) \ + _ATTR_R(_BAT, remaining_running_time_now) \ + _ATTR_R(_BAT, remaining_capacity) \ + _ATTR_R(_BAT, last_full_capacity) \ + _ATTR_R(_BAT, design_voltage) \ + _ATTR_R(_BAT, charging_max_voltage) \ + _ATTR_R(_BAT, design_capacity) \ + _ATTR_R(_BAT, cycle_count) \ + _ATTR_R(_BAT, temperature) \ + _ATTR_R(_BAT, serial) \ + _ATTR_R(_BAT, manufacture_date) \ + _ATTR_R(_BAT, first_use_date) \ + _ATTR_R(_BAT, dump) + +/* Define several macros we will feed into FOREACH_BAT_ATTR: */ + +#define DEFINE_BAT_ATTR_RW(_BAT,_NAME) \ + static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ + .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, \ + store_battery_##_NAME), \ + .bat = _BAT \ + }; + +#define DEFINE_BAT_ATTR_R(_BAT,_NAME) \ + static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ + .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, 0), \ + .bat = _BAT \ + }; + +#define REF_BAT_ATTR(_BAT,_NAME) \ + &dev_attr_##_NAME##_##_BAT.dev_attr.attr, + +/* This provides all attributes for one battery: */ + +#define PROVIDE_BAT_ATTRS(_BAT) \ + FOREACH_BAT_ATTR(_BAT, DEFINE_BAT_ATTR_RW, DEFINE_BAT_ATTR_R) \ + static struct attribute *tp_bat##_BAT##_attributes[] = { \ + FOREACH_BAT_ATTR(_BAT, REF_BAT_ATTR, REF_BAT_ATTR) \ + NULL \ + }; \ + static struct attribute_group tp_bat##_BAT##_attribute_group = { \ + .name = "BAT" #_BAT, \ + .attrs = tp_bat##_BAT##_attributes \ + }; + +/* Finally generate the attributes: */ + +PROVIDE_BAT_ATTRS(0) +PROVIDE_BAT_ATTRS(1) + +/* List of attribute groups */ + +static struct attribute_group *attr_groups[] = { + &tp_root_attribute_group, + &tp_bat0_attribute_group, + &tp_bat1_attribute_group, + NULL +}; + + +/********************************************************************* + * Init and cleanup + */ + +static struct attribute_group **next_attr_group; /* next to register */ + +static int __init tp_init(void) +{ + int ret; + printk(KERN_INFO "tp_smapi " TP_VERSION " loading...\n"); + + ret = find_smapi_port(); + if (ret < 0) + goto err; + else + smapi_port = ret; + + if (!request_region(smapi_port, 1, "smapi")) { + printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", + smapi_port); + ret = -ENXIO; + goto err; + } + + if (!request_region(SMAPI_PORT2, 1, "smapi")) { + printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", + SMAPI_PORT2); + ret = -ENXIO; + goto err_port1; + } + + ret = platform_driver_register(&tp_driver); + if (ret) + goto err_port2; + + 
pdev = platform_device_alloc("smapi", -1); + if (!pdev) { + ret = -ENOMEM; + goto err_driver; + } + + ret = platform_device_add(pdev); + if (ret) + goto err_device_free; + + for (next_attr_group = attr_groups; *next_attr_group; + ++next_attr_group) { + ret = sysfs_create_group(&pdev->dev.kobj, *next_attr_group); + if (ret) + goto err_attr; + } + + printk(KERN_INFO "tp_smapi successfully loaded (smapi_port=0x%x).\n", + smapi_port); + return 0; + +err_attr: + while (--next_attr_group >= attr_groups) + sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); + platform_device_unregister(pdev); +err_device_free: + platform_device_put(pdev); +err_driver: + platform_driver_unregister(&tp_driver); +err_port2: + release_region(SMAPI_PORT2, 1); +err_port1: + release_region(smapi_port, 1); +err: + printk(KERN_ERR "tp_smapi init failed (ret=%d)!\n", ret); + return ret; +} + +static void __exit tp_exit(void) +{ + while (next_attr_group && --next_attr_group >= attr_groups) + sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); + platform_device_unregister(pdev); + platform_driver_unregister(&tp_driver); + release_region(SMAPI_PORT2, 1); + if (smapi_port) + release_region(smapi_port, 1); + + printk(KERN_INFO "tp_smapi unloaded.\n"); +} + +module_init(tp_init); +module_exit(tp_exit); diff -Nur linux-4.2/drivers/staging/Kconfig linux-4.2-pck/drivers/staging/Kconfig --- linux-4.2/drivers/staging/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/staging/Kconfig 2015-09-08 00:48:22.225030705 -0300 @@ -112,4 +112,6 @@ source "drivers/staging/wilc1000/Kconfig" +source "drivers/staging/vhba/Kconfig" + endif # STAGING diff -Nur linux-4.2/drivers/staging/Makefile linux-4.2-pck/drivers/staging/Makefile --- linux-4.2/drivers/staging/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/staging/Makefile 2015-09-08 00:48:22.225030705 -0300 @@ -25,6 +25,7 @@ obj-$(CONFIG_FB_SM7XX) += sm7xxfb/ obj-$(CONFIG_FB_SM750) += sm750fb/ obj-$(CONFIG_FB_XGI) += xgifb/ +obj-$(CONFIG_VHBA) += vhba/ obj-$(CONFIG_USB_EMXX) += emxx_udc/ obj-$(CONFIG_FT1000) += ft1000/ obj-$(CONFIG_SPEAKUP) += speakup/ diff -Nur linux-4.2/drivers/staging/android/ashmem.c linux-4.2-pck/drivers/staging/android/ashmem.c --- linux-4.2/drivers/staging/android/ashmem.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/staging/android/ashmem.c 2015-09-08 11:20:32.140361257 -0300 @@ -387,7 +387,7 @@ name = asma->name; /* ... and allocate the backing shmem file */ - vmfile = shmem_file_setup(name, asma->size, vma->vm_flags); + vmfile = shmem_file_setup(name, asma->size, vma->vm_flags, 0); if (unlikely(IS_ERR(vmfile))) { ret = PTR_ERR(vmfile); goto out; diff -Nur linux-4.2/drivers/staging/vhba/Kconfig linux-4.2-pck/drivers/staging/vhba/Kconfig --- linux-4.2/drivers/staging/vhba/Kconfig 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/drivers/staging/vhba/Kconfig 2015-09-08 00:48:22.225030705 -0300 @@ -0,0 +1,9 @@ +config VHBA + tristate "Virtual (SCSI) Host Bus Adapter" + depends on SCSI + ---help--- + This is the in-kernel part of CDEmu, a CD/DVD-ROM device + emulator. + + This driver can also be built as a module. If so, the module + will be called vhba. 
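As background for the Kconfig entry above: vhba.c (added later in this patch) exposes the kernel/user protocol through the /dev/vhba_ctl character device, where every read() returns one struct vhba_request (plus any write-data) and every write() must start with a struct vhba_response (plus any read-data or sense bytes). The following stand-alone user-space loop is only an illustrative sketch, not part of the patch: the two struct definitions are copied by hand from vhba.c and must stay layout-compatible with it, the buffer size mirrors VHBA_MAX_SECTORS_PER_IO (256 sectors of 512 bytes), and a real emulator (CDEmu's user-space side) would decode req.cdb and return real data and sense information instead of a blanket GOOD status.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define MAX_COMMAND_SIZE 16     /* matches vhba.c */

struct vhba_request {           /* hand-copied from vhba.c; must match its layout */
        uint32_t tag;
        uint32_t lun;
        uint8_t  cdb[MAX_COMMAND_SIZE];
        uint8_t  cdb_len;
        uint32_t data_len;
};

struct vhba_response {          /* hand-copied from vhba.c; must match its layout */
        uint32_t tag;
        uint32_t status;
        uint32_t data_len;
};

int main(void)
{
        /* request header plus VHBA_MAX_SECTORS_PER_IO (256) * 512 bytes of data */
        static unsigned char buf[sizeof(struct vhba_request) + 256 * 512];
        struct vhba_request req;
        struct vhba_response res;
        int fd = open("/dev/vhba_ctl", O_RDWR);

        if (fd < 0) {
                perror("open /dev/vhba_ctl");
                return 1;
        }
        for (;;) {
                /* one read() yields one queued SCSI command (plus any write-data) */
                if (read(fd, buf, sizeof(buf)) < (ssize_t)sizeof(req))
                        break;
                memcpy(&req, buf, sizeof(req));
                fprintf(stderr, "tag %u, cdb[0]=0x%02x, data_len=%u\n",
                        (unsigned)req.tag, (unsigned)req.cdb[0],
                        (unsigned)req.data_len);

                /* answer with GOOD status and no data; a real emulator fills this in */
                memset(&res, 0, sizeof(res));
                res.tag = req.tag;
                if (write(fd, &res, sizeof(res)) < 0)
                        break;
        }
        close(fd);
        return 0;
}

Each read() blocks until the SCSI midlayer queues a command for the virtual adapter; echoing the request's tag in the response is what lets the driver match the answer to the pending command.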
diff -Nur linux-4.2/drivers/staging/vhba/Makefile linux-4.2-pck/drivers/staging/vhba/Makefile --- linux-4.2/drivers/staging/vhba/Makefile 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/drivers/staging/vhba/Makefile 2015-09-08 00:48:22.225030705 -0300 @@ -0,0 +1,4 @@ +VHBA_VERSION := 20140928 + +obj-$(CONFIG_VHBA) += vhba.o +ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror diff -Nur linux-4.2/drivers/staging/vhba/vhba.c linux-4.2-pck/drivers/staging/vhba/vhba.c --- linux-4.2/drivers/staging/vhba/vhba.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/drivers/staging/vhba/vhba.c 2015-09-08 00:48:22.228363880 -0300 @@ -0,0 +1,1071 @@ +/* + * vhba.c + * + * Copyright (C) 2007-2012 Chia-I Wu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_COMPAT +#include +#endif +#include +#include +#include +#include +#include + +/* scatterlist.page_link and sg_page() were introduced in 2.6.24 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24) +#define USE_SG_PAGE +#include +#endif + +MODULE_AUTHOR("Chia-I Wu"); +MODULE_VERSION(VHBA_VERSION); +MODULE_DESCRIPTION("Virtual SCSI HBA"); +MODULE_LICENSE("GPL"); + +#ifdef DEBUG +#define DPRINTK(fmt, args...) printk(KERN_DEBUG "%s: " fmt, __FUNCTION__, ## args) +#else +#define DPRINTK(fmt, args...) +#endif + +/* scmd_dbg was introduced in 3.15 */ +#ifndef scmd_dbg +#define scmd_dbg(scmd, fmt, a...) \ + dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a) +#endif + +#ifndef scmd_warn +#define scmd_warn(scmd, fmt, a...) 
\ + dev_warn(&(scmd)->device->sdev_gendev, fmt, ##a) +#endif + +#define VHBA_MAX_SECTORS_PER_IO 256 +#define VHBA_MAX_ID 32 +#define VHBA_CAN_QUEUE 32 +#define VHBA_INVALID_ID VHBA_MAX_ID + +#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL) +#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL) + + +/* SCSI macros were introduced in 2.6.23 */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23) +#define scsi_sg_count(cmd) ((cmd)->use_sg) +#define scsi_sglist(cmd) ((cmd)->request_buffer) +#define scsi_bufflen(cmd) ((cmd)->request_bufflen) +#define scsi_set_resid(cmd, to_read) {(cmd)->resid = (to_read);} +#endif + +/* 1-argument form of k[un]map_atomic was introduced in 2.6.37-rc1; + 2-argument form was deprecated in 3.4-rc1 */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 37) +#define vhba_kmap_atomic kmap_atomic +#define vhba_kunmap_atomic kunmap_atomic +#else +#define vhba_kmap_atomic(page) kmap_atomic(page, KM_USER0) +#define vhba_kunmap_atomic(page) kunmap_atomic(page, KM_USER0) +#endif + + +enum vhba_req_state { + VHBA_REQ_FREE, + VHBA_REQ_PENDING, + VHBA_REQ_READING, + VHBA_REQ_SENT, + VHBA_REQ_WRITING, +}; + +struct vhba_command { + struct scsi_cmnd *cmd; + int status; + struct list_head entry; +}; + +struct vhba_device { + uint id; + spinlock_t cmd_lock; + struct list_head cmd_list; + wait_queue_head_t cmd_wq; + atomic_t refcnt; +}; + +struct vhba_host { + struct Scsi_Host *shost; + spinlock_t cmd_lock; + int cmd_next; + struct vhba_command commands[VHBA_CAN_QUEUE]; + spinlock_t dev_lock; + struct vhba_device *devices[VHBA_MAX_ID]; + int num_devices; + DECLARE_BITMAP(chgmap, VHBA_MAX_ID); + int chgtype[VHBA_MAX_ID]; + struct work_struct scan_devices; +}; + +#define MAX_COMMAND_SIZE 16 + +struct vhba_request { + __u32 tag; + __u32 lun; + __u8 cdb[MAX_COMMAND_SIZE]; + __u8 cdb_len; + __u32 data_len; +}; + +struct vhba_response { + __u32 tag; + __u32 status; + __u32 data_len; +}; + +static struct vhba_command *vhba_alloc_command (void); +static void vhba_free_command (struct vhba_command *vcmd); + +static struct platform_device vhba_platform_device; + +static struct vhba_device *vhba_device_alloc (void) +{ + struct vhba_device *vdev; + + vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL); + if (!vdev) { + return NULL; + } + + vdev->id = VHBA_INVALID_ID; + spin_lock_init(&vdev->cmd_lock); + INIT_LIST_HEAD(&vdev->cmd_list); + init_waitqueue_head(&vdev->cmd_wq); + atomic_set(&vdev->refcnt, 1); + + return vdev; +} + +static void vhba_device_put (struct vhba_device *vdev) +{ + if (atomic_dec_and_test(&vdev->refcnt)) { + kfree(vdev); + } +} + +static struct vhba_device *vhba_device_get (struct vhba_device *vdev) +{ + atomic_inc(&vdev->refcnt); + + return vdev; +} + +static int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd) +{ + struct vhba_command *vcmd; + unsigned long flags; + + vcmd = vhba_alloc_command(); + if (!vcmd) { + return SCSI_MLQUEUE_HOST_BUSY; + } + + vcmd->cmd = cmd; + + spin_lock_irqsave(&vdev->cmd_lock, flags); + list_add_tail(&vcmd->entry, &vdev->cmd_list); + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + wake_up_interruptible(&vdev->cmd_wq); + + return 0; +} + +static int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd) +{ + struct vhba_command *vcmd; + int retval; + unsigned long flags; + + spin_lock_irqsave(&vdev->cmd_lock, flags); + list_for_each_entry(vcmd, &vdev->cmd_list, entry) { + if (vcmd->cmd == cmd) { + list_del_init(&vcmd->entry); + break; + } + } 
+ + /* command not found */ + if (&vcmd->entry == &vdev->cmd_list) { + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + return SUCCESS; + } + + while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) { + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + scmd_dbg(cmd, "wait for I/O before aborting\n"); + schedule_timeout(1); + spin_lock_irqsave(&vdev->cmd_lock, flags); + } + + retval = (vcmd->status == VHBA_REQ_SENT) ? FAILED : SUCCESS; + + vhba_free_command(vcmd); + + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + return retval; +} + +static inline void vhba_scan_devices_add (struct vhba_host *vhost, int id) +{ + struct scsi_device *sdev; + + sdev = scsi_device_lookup(vhost->shost, 0, id, 0); + if (!sdev) { + scsi_add_device(vhost->shost, 0, id, 0); + } else { + dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device 0:%d:0!\n", id); + scsi_device_put(sdev); + } +} + +static inline void vhba_scan_devices_remove (struct vhba_host *vhost, int id) +{ + struct scsi_device *sdev; + + sdev = scsi_device_lookup(vhost->shost, 0, id, 0); + if (sdev) { + scsi_remove_device(sdev); + scsi_device_put(sdev); + } else { + dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device 0:%d:0!\n", id); + } +} + +static void vhba_scan_devices (struct work_struct *work) +{ + struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices); + unsigned long flags; + int id, change, exists; + + while (1) { + spin_lock_irqsave(&vhost->dev_lock, flags); + + id = find_first_bit(vhost->chgmap, vhost->shost->max_id); + if (id >= vhost->shost->max_id) { + spin_unlock_irqrestore(&vhost->dev_lock, flags); + break; + } + change = vhost->chgtype[id]; + exists = vhost->devices[id] != NULL; + + vhost->chgtype[id] = 0; + clear_bit(id, vhost->chgmap); + + spin_unlock_irqrestore(&vhost->dev_lock, flags); + + if (change < 0) { + dev_dbg(&vhost->shost->shost_gendev, "trying to remove target 0:%d:0\n", id); + vhba_scan_devices_remove(vhost, id); + } else if (change > 0) { + dev_dbg(&vhost->shost->shost_gendev, "trying to add target 0:%d:0\n", id); + vhba_scan_devices_add(vhost, id); + } else { + /* quick sequence of add/remove or remove/add; we determine + which one it was by checking if device structure exists */ + if (exists) { + /* remove followed by add: remove and (re)add */ + dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target 0:%d:0\n", id); + vhba_scan_devices_remove(vhost, id); + vhba_scan_devices_add(vhost, id); + } else { + /* add followed by remove: no-op */ + dev_dbg(&vhost->shost->shost_gendev, "no-op for target 0:%d:0\n", id); + } + } + } +} + +static int vhba_add_device (struct vhba_device *vdev) +{ + struct vhba_host *vhost; + int i; + unsigned long flags; + + vhost = platform_get_drvdata(&vhba_platform_device); + + vhba_device_get(vdev); + + spin_lock_irqsave(&vhost->dev_lock, flags); + if (vhost->num_devices >= vhost->shost->max_id) { + spin_unlock_irqrestore(&vhost->dev_lock, flags); + vhba_device_put(vdev); + return -EBUSY; + } + + for (i = 0; i < vhost->shost->max_id; i++) { + if (vhost->devices[i] == NULL) { + vdev->id = i; + vhost->devices[i] = vdev; + vhost->num_devices++; + set_bit(vdev->id, vhost->chgmap); + vhost->chgtype[vdev->id]++; + break; + } + } + spin_unlock_irqrestore(&vhost->dev_lock, flags); + + schedule_work(&vhost->scan_devices); + + return 0; +} + +static int vhba_remove_device (struct vhba_device *vdev) +{ + struct vhba_host *vhost; + unsigned long flags; + + vhost = 
platform_get_drvdata(&vhba_platform_device); + + spin_lock_irqsave(&vhost->dev_lock, flags); + set_bit(vdev->id, vhost->chgmap); + vhost->chgtype[vdev->id]--; + vhost->devices[vdev->id] = NULL; + vhost->num_devices--; + vdev->id = VHBA_INVALID_ID; + spin_unlock_irqrestore(&vhost->dev_lock, flags); + + vhba_device_put(vdev); + + schedule_work(&vhost->scan_devices); + + return 0; +} + +static struct vhba_device *vhba_lookup_device (int id) +{ + struct vhba_host *vhost; + struct vhba_device *vdev = NULL; + unsigned long flags; + + vhost = platform_get_drvdata(&vhba_platform_device); + + if (likely(id < vhost->shost->max_id)) { + spin_lock_irqsave(&vhost->dev_lock, flags); + vdev = vhost->devices[id]; + if (vdev) { + vdev = vhba_device_get(vdev); + } + + spin_unlock_irqrestore(&vhost->dev_lock, flags); + } + + return vdev; +} + +static struct vhba_command *vhba_alloc_command (void) +{ + struct vhba_host *vhost; + struct vhba_command *vcmd; + unsigned long flags; + int i; + + vhost = platform_get_drvdata(&vhba_platform_device); + + spin_lock_irqsave(&vhost->cmd_lock, flags); + + vcmd = vhost->commands + vhost->cmd_next++; + if (vcmd->status != VHBA_REQ_FREE) { + for (i = 0; i < vhost->shost->can_queue; i++) { + vcmd = vhost->commands + i; + + if (vcmd->status == VHBA_REQ_FREE) { + vhost->cmd_next = i + 1; + break; + } + } + + if (i == vhost->shost->can_queue) { + vcmd = NULL; + } + } + + if (vcmd) { + vcmd->status = VHBA_REQ_PENDING; + } + + vhost->cmd_next %= vhost->shost->can_queue; + + spin_unlock_irqrestore(&vhost->cmd_lock, flags); + + return vcmd; +} + +static void vhba_free_command (struct vhba_command *vcmd) +{ + struct vhba_host *vhost; + unsigned long flags; + + vhost = platform_get_drvdata(&vhba_platform_device); + + spin_lock_irqsave(&vhost->cmd_lock, flags); + vcmd->status = VHBA_REQ_FREE; + spin_unlock_irqrestore(&vhost->cmd_lock, flags); +} + +static int vhba_queuecommand_lck (struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)) +{ + struct vhba_device *vdev; + int retval; + + scmd_dbg(cmd, "queue %lu\n", cmd->serial_number); + + vdev = vhba_lookup_device(cmd->device->id); + if (!vdev) { + scmd_dbg(cmd, "no such device\n"); + + cmd->result = DID_NO_CONNECT << 16; + done(cmd); + + return 0; + } + + cmd->scsi_done = done; + retval = vhba_device_queue(vdev, cmd); + + vhba_device_put(vdev); + + return retval; +} + +#ifdef DEF_SCSI_QCMD +DEF_SCSI_QCMD(vhba_queuecommand) +#else +#define vhba_queuecommand vhba_queuecommand_lck +#endif + +static int vhba_abort (struct scsi_cmnd *cmd) +{ + struct vhba_device *vdev; + int retval = SUCCESS; + + scmd_warn(cmd, "abort %lu\n", cmd->serial_number); + + vdev = vhba_lookup_device(cmd->device->id); + if (vdev) { + retval = vhba_device_dequeue(vdev, cmd); + vhba_device_put(vdev); + } else { + cmd->result = DID_NO_CONNECT << 16; + } + + return retval; +} + +static struct scsi_host_template vhba_template = { + .module = THIS_MODULE, + .name = "vhba", + .proc_name = "vhba", + .queuecommand = vhba_queuecommand, + .eh_abort_handler = vhba_abort, + .can_queue = VHBA_CAN_QUEUE, + .this_id = -1, + .cmd_per_lun = 1, + .max_sectors = VHBA_MAX_SECTORS_PER_IO, + .sg_tablesize = 256, +}; + +static ssize_t do_request (struct scsi_cmnd *cmd, char __user *buf, size_t buf_len) +{ + struct vhba_request vreq; + ssize_t ret; + + scmd_dbg(cmd, "request %lu, cdb 0x%x, bufflen %d, use_sg %d\n", + cmd->serial_number, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd)); + + ret = sizeof(vreq); + if (DATA_TO_DEVICE(cmd->sc_data_direction)) { + ret += scsi_bufflen(cmd); 
+ } + + if (ret > buf_len) { + scmd_warn(cmd, "buffer too small (%zd < %zd) for a request\n", buf_len, ret); + return -EIO; + } + + vreq.tag = cmd->serial_number; + vreq.lun = cmd->device->lun; + memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE); + vreq.cdb_len = cmd->cmd_len; + vreq.data_len = scsi_bufflen(cmd); + + if (copy_to_user(buf, &vreq, sizeof(vreq))) { + return -EFAULT; + } + + if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) { + buf += sizeof(vreq); + + if (scsi_sg_count(cmd)) { + unsigned char buf_stack[64]; + unsigned char *kaddr, *uaddr, *kbuf; + struct scatterlist *sg = scsi_sglist(cmd); + int i; + + uaddr = (unsigned char *) buf; + + if (vreq.data_len > 64) { + kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + } else { + kbuf = buf_stack; + } + + for (i = 0; i < scsi_sg_count(cmd); i++) { + size_t len = sg[i].length; + +#ifdef USE_SG_PAGE + kaddr = vhba_kmap_atomic(sg_page(&sg[i])); +#else + kaddr = vhba_kmap_atomic(sg[i].page); +#endif + memcpy(kbuf, kaddr + sg[i].offset, len); + vhba_kunmap_atomic(kaddr); + + if (copy_to_user(uaddr, kbuf, len)) { + if (kbuf != buf_stack) { + kfree(kbuf); + } + return -EFAULT; + } + uaddr += len; + } + + if (kbuf != buf_stack) { + kfree(kbuf); + } + } else { + if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) { + return -EFAULT; + } + } + } + + return ret; +} + +static ssize_t do_response (struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res) +{ + ssize_t ret = 0; + + scmd_dbg(cmd, "response %lu, status %x, data len %d, use_sg %d\n", + cmd->serial_number, res->status, res->data_len, scsi_sg_count(cmd)); + + if (res->status) { + unsigned char sense_stack[SCSI_SENSE_BUFFERSIZE]; + + if (res->data_len > SCSI_SENSE_BUFFERSIZE) { + scmd_warn(cmd, "truncate sense (%d < %d)", SCSI_SENSE_BUFFERSIZE, res->data_len); + res->data_len = SCSI_SENSE_BUFFERSIZE; + } + + /* Copy via temporary buffer on stack in order to avoid problems + with PAX on grsecurity-enabled kernels */ + if (copy_from_user(sense_stack, buf, res->data_len)) { + return -EFAULT; + } + memcpy(cmd->sense_buffer, sense_stack, res->data_len); + + cmd->result = res->status; + + ret += res->data_len; + } else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) { + size_t to_read; + + if (res->data_len > scsi_bufflen(cmd)) { + scmd_warn(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len); + res->data_len = scsi_bufflen(cmd); + } + + to_read = res->data_len; + + if (scsi_sg_count(cmd)) { + unsigned char buf_stack[64]; + unsigned char *kaddr, *uaddr, *kbuf; + struct scatterlist *sg = scsi_sglist(cmd); + int i; + + uaddr = (unsigned char *)buf; + + if (res->data_len > 64) { + kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + } else { + kbuf = buf_stack; + } + + for (i = 0; i < scsi_sg_count(cmd); i++) { + size_t len = (sg[i].length < to_read) ? 
sg[i].length : to_read; + + if (copy_from_user(kbuf, uaddr, len)) { + if (kbuf != buf_stack) { + kfree(kbuf); + } + return -EFAULT; + } + uaddr += len; + +#ifdef USE_SG_PAGE + kaddr = vhba_kmap_atomic(sg_page(&sg[i])); +#else + kaddr = vhba_kmap_atomic(sg[i].page); +#endif + memcpy(kaddr + sg[i].offset, kbuf, len); + vhba_kunmap_atomic(kaddr); + + to_read -= len; + if (to_read == 0) { + break; + } + } + + if (kbuf != buf_stack) { + kfree(kbuf); + } + } else { + if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) { + return -EFAULT; + } + + to_read -= res->data_len; + } + + scsi_set_resid(cmd, to_read); + + ret += res->data_len - to_read; + } + + return ret; +} + +static inline struct vhba_command *next_command (struct vhba_device *vdev) +{ + struct vhba_command *vcmd; + + list_for_each_entry(vcmd, &vdev->cmd_list, entry) { + if (vcmd->status == VHBA_REQ_PENDING) { + break; + } + } + + if (&vcmd->entry == &vdev->cmd_list) { + vcmd = NULL; + } + + return vcmd; +} + +static inline struct vhba_command *match_command (struct vhba_device *vdev, u32 tag) +{ + struct vhba_command *vcmd; + + list_for_each_entry(vcmd, &vdev->cmd_list, entry) { + if (vcmd->cmd->serial_number == tag) { + break; + } + } + + if (&vcmd->entry == &vdev->cmd_list) { + vcmd = NULL; + } + + return vcmd; +} + +static struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags) +{ + struct vhba_command *vcmd; + DEFINE_WAIT(wait); + + while (!(vcmd = next_command(vdev))) { + if (signal_pending(current)) { + break; + } + + prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE); + + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + schedule(); + + spin_lock_irqsave(&vdev->cmd_lock, flags); + } + + finish_wait(&vdev->cmd_wq, &wait); + if (vcmd) { + vcmd->status = VHBA_REQ_READING; + } + + return vcmd; +} + +static ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset) +{ + struct vhba_device *vdev; + struct vhba_command *vcmd; + ssize_t ret; + unsigned long flags; + + vdev = file->private_data; + + /* Get next command */ + if (file->f_flags & O_NONBLOCK) { + /* Non-blocking variant */ + spin_lock_irqsave(&vdev->cmd_lock, flags); + vcmd = next_command(vdev); + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + if (!vcmd) { + return -EWOULDBLOCK; + } + } else { + /* Blocking variant */ + spin_lock_irqsave(&vdev->cmd_lock, flags); + vcmd = wait_command(vdev, flags); + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + if (!vcmd) { + return -ERESTARTSYS; + } + } + + ret = do_request(vcmd->cmd, buf, buf_len); + + spin_lock_irqsave(&vdev->cmd_lock, flags); + if (ret >= 0) { + vcmd->status = VHBA_REQ_SENT; + *offset += ret; + } else { + vcmd->status = VHBA_REQ_PENDING; + } + + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + return ret; +} + +static ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset) +{ + struct vhba_device *vdev; + struct vhba_command *vcmd; + struct vhba_response res; + ssize_t ret; + unsigned long flags; + + if (buf_len < sizeof(res)) { + return -EIO; + } + + if (copy_from_user(&res, buf, sizeof(res))) { + return -EFAULT; + } + + vdev = file->private_data; + + spin_lock_irqsave(&vdev->cmd_lock, flags); + vcmd = match_command(vdev, res.tag); + if (!vcmd || vcmd->status != VHBA_REQ_SENT) { + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + DPRINTK("not expecting response\n"); + return -EIO; + } + vcmd->status = VHBA_REQ_WRITING; + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + ret = 
do_response(vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res); + + spin_lock_irqsave(&vdev->cmd_lock, flags); + if (ret >= 0) { + vcmd->cmd->scsi_done(vcmd->cmd); + ret += sizeof(res); + + /* don't compete with vhba_device_dequeue */ + if (!list_empty(&vcmd->entry)) { + list_del_init(&vcmd->entry); + vhba_free_command(vcmd); + } + } else { + vcmd->status = VHBA_REQ_SENT; + } + + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + return ret; +} + +static long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg) +{ + struct vhba_device *vdev = file->private_data; + struct vhba_host *vhost; + struct scsi_device *sdev; + + switch (cmd) { + case 0xBEEF001: { + vhost = platform_get_drvdata(&vhba_platform_device); + sdev = scsi_device_lookup(vhost->shost, 0, vdev->id, 0); + + if (sdev) { + int id[4] = { + sdev->host->host_no, + sdev->channel, + sdev->id, + sdev->lun + }; + + scsi_device_put(sdev); + + if (copy_to_user((void *)arg, id, sizeof(id))) { + return -EFAULT; + } + + return 0; + } else { + return -ENODEV; + } + } + } + + return -ENOTTY; +} + +#ifdef CONFIG_COMPAT +static long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg) +{ + unsigned long compat_arg = (unsigned long)compat_ptr(arg); + return vhba_ctl_ioctl(file, cmd, compat_arg); +} +#endif + +static unsigned int vhba_ctl_poll (struct file *file, poll_table *wait) +{ + struct vhba_device *vdev = file->private_data; + unsigned int mask = 0; + unsigned long flags; + + poll_wait(file, &vdev->cmd_wq, wait); + + spin_lock_irqsave(&vdev->cmd_lock, flags); + if (next_command(vdev)) { + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + return mask; +} + +static int vhba_ctl_open (struct inode *inode, struct file *file) +{ + struct vhba_device *vdev; + int retval; + + DPRINTK("open\n"); + + /* check if vhba is probed */ + if (!platform_get_drvdata(&vhba_platform_device)) { + return -ENODEV; + } + + vdev = vhba_device_alloc(); + if (!vdev) { + return -ENOMEM; + } + + if (!(retval = vhba_add_device(vdev))) { + file->private_data = vdev; + } + + vhba_device_put(vdev); + + return retval; +} + +static int vhba_ctl_release (struct inode *inode, struct file *file) +{ + struct vhba_device *vdev; + struct vhba_command *vcmd; + unsigned long flags; + + DPRINTK("release\n"); + + vdev = file->private_data; + + vhba_device_get(vdev); + vhba_remove_device(vdev); + + spin_lock_irqsave(&vdev->cmd_lock, flags); + list_for_each_entry(vcmd, &vdev->cmd_list, entry) { + WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING); + + scmd_warn(vcmd->cmd, "device released with command %lu\n", vcmd->cmd->serial_number); + vcmd->cmd->result = DID_NO_CONNECT << 16; + vcmd->cmd->scsi_done(vcmd->cmd); + + vhba_free_command(vcmd); + } + INIT_LIST_HEAD(&vdev->cmd_list); + spin_unlock_irqrestore(&vdev->cmd_lock, flags); + + vhba_device_put(vdev); + + return 0; +} + +static struct file_operations vhba_ctl_fops = { + .owner = THIS_MODULE, + .open = vhba_ctl_open, + .release = vhba_ctl_release, + .read = vhba_ctl_read, + .write = vhba_ctl_write, + .poll = vhba_ctl_poll, + .unlocked_ioctl = vhba_ctl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vhba_ctl_compat_ioctl, +#endif +}; + +static struct miscdevice vhba_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "vhba_ctl", + .fops = &vhba_ctl_fops, +}; + +static int vhba_probe (struct platform_device *pdev) +{ + struct Scsi_Host *shost; + struct vhba_host *vhost; + int i; + + shost = 
scsi_host_alloc(&vhba_template, sizeof(struct vhba_host)); + if (!shost) { + return -ENOMEM; + } + + shost->max_id = VHBA_MAX_ID; + /* we don't support lun > 0 */ + shost->max_lun = 1; + shost->max_cmd_len = MAX_COMMAND_SIZE; + + vhost = (struct vhba_host *)shost->hostdata; + memset(vhost, 0, sizeof(*vhost)); + + vhost->shost = shost; + vhost->num_devices = 0; + spin_lock_init(&vhost->dev_lock); + spin_lock_init(&vhost->cmd_lock); + INIT_WORK(&vhost->scan_devices, vhba_scan_devices); + vhost->cmd_next = 0; + for (i = 0; i < vhost->shost->can_queue; i++) { + vhost->commands[i].status = VHBA_REQ_FREE; + } + + platform_set_drvdata(pdev, vhost); + + if (scsi_add_host(shost, &pdev->dev)) { + scsi_host_put(shost); + return -ENOMEM; + } + + return 0; +} + +static int vhba_remove (struct platform_device *pdev) +{ + struct vhba_host *vhost; + struct Scsi_Host *shost; + + vhost = platform_get_drvdata(pdev); + shost = vhost->shost; + + scsi_remove_host(shost); + scsi_host_put(shost); + + return 0; +} + +static void vhba_release (struct device * dev) +{ + return; +} + +static struct platform_device vhba_platform_device = { + .name = "vhba", + .id = -1, + .dev = { + .release = vhba_release, + }, +}; + +static struct platform_driver vhba_platform_driver = { + .driver = { + .owner = THIS_MODULE, + .name = "vhba", + }, + .probe = vhba_probe, + .remove = vhba_remove, +}; + +static int __init vhba_init (void) +{ + int ret; + + ret = platform_device_register(&vhba_platform_device); + if (ret < 0) { + return ret; + } + + ret = platform_driver_register(&vhba_platform_driver); + if (ret < 0) { + platform_device_unregister(&vhba_platform_device); + return ret; + } + + ret = misc_register(&vhba_miscdev); + if (ret < 0) { + platform_driver_unregister(&vhba_platform_driver); + platform_device_unregister(&vhba_platform_device); + return ret; + } + + return 0; +} + +static void __exit vhba_exit(void) +{ + misc_deregister(&vhba_miscdev); + platform_driver_unregister(&vhba_platform_driver); + platform_device_unregister(&vhba_platform_device); +} + +module_init(vhba_init); +module_exit(vhba_exit); + diff -Nur linux-4.2/drivers/tty/Kconfig linux-4.2-pck/drivers/tty/Kconfig --- linux-4.2/drivers/tty/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/tty/Kconfig 2015-09-08 00:48:22.228363880 -0300 @@ -75,6 +75,124 @@ def_bool y depends on VT_CONSOLE && PM_SLEEP +menuconfig VT_CKO + bool "Colored kernel message output" + depends on VT_CONSOLE + ---help--- + This option enables kernel messages to be emitted in + colors other than the default. + + The color value you need to enter is composed (OR-ed) + of a foreground and a background color. + + Foreground: + 0x00 = black, 0x08 = dark gray, + 0x01 = red, 0x09 = light red, + 0x02 = green, 0x0A = light green, + 0x03 = brown, 0x0B = yellow, + 0x04 = blue, 0x0C = light blue, + 0x05 = magenta, 0x0D = light magenta, + 0x06 = cyan, 0x0E = light cyan, + 0x07 = gray, 0x0F = white, + + (Foreground colors 0x08 to 0x0F do not work when a VGA + console font with 512 glyphs is used.) + + Background: + 0x00 = black, 0x40 = blue, + 0x10 = red, 0x50 = magenta, + 0x20 = green, 0x60 = cyan, + 0x30 = brown, 0x70 = gray, + + For example, 0x1F would yield white on red. + + If unsure, say N. + +config VT_PRINTK_EMERG_COLOR + hex "Emergency messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel emergency messages will + be printed to the console. 
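The VT_CKO help text above composes each VT_PRINTK_*_COLOR value by OR-ing a foreground code into the low nibble and a background code into bits 4-6, and vc_set_color() in the vt.c hunk near the end of this patch takes the byte apart again with the same masks (additionally remapping both fields through color_table and preserving bit 7). As a stand-alone illustration only, not part of the patch, the following sketch packs the "white on red" example value 0x1F and splits it back into its two fields:

#include <stdio.h>

int main(void)
{
        unsigned char fg    = 0x0F;     /* white foreground (from the table above) */
        unsigned char bg    = 0x10;     /* red background (from the table above)   */
        unsigned char color = fg | bg;  /* 0x1F, the "white on red" example        */

        /* the same masks vc_set_color() applies to the packed byte */
        printf("packed value: 0x%02X\n", color);
        printf("foreground:   0x%X\n", color & 0x0F);          /* 0xF = white */
        printf("background:   0x%X\n", (color >> 4) & 0x07);   /* 0x1 = red   */
        return 0;
}

Any of the VT_PRINTK_*_COLOR options below accepts a value built the same way.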
+ +config VT_PRINTK_ALERT_COLOR + hex "Alert messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel alert messages will + be printed to the console. + +config VT_PRINTK_CRIT_COLOR + hex "Critical messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel critical messages will + be printed to the console. + +config VT_PRINTK_ERR_COLOR + hex "Error messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel error messages will + be printed to the console. + +config VT_PRINTK_WARNING_COLOR + hex "Warning messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel warning messages will + be printed to the console. + +config VT_PRINTK_NOTICE_COLOR + hex "Notice messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel notice messages will + be printed to the console. + +config VT_PRINTK_INFO_COLOR + hex "Information messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel information messages will + be printed to the console. + +config VT_PRINTK_DEBUG_COLOR + hex "Debug messages color" + range 0x00 0xFF + depends on VT_CKO + default 0x07 + ---help--- + This option defines with which color kernel debug messages will + be printed to the console. + +config NR_TTY_DEVICES + int "Maximum tty device number" + depends on VT + range 12 63 + default 63 + ---help--- + This option is used to change the number of tty devices in /dev. + The default value is 63. The lowest number you can set is 12, + 63 is also the upper limit so we don't overrun the serial + consoles. + + If unsure, say 63. + config HW_CONSOLE bool depends on VT && !UML diff -Nur linux-4.2/drivers/tty/serial/8250/8250_core.c linux-4.2-pck/drivers/tty/serial/8250/8250_core.c --- linux-4.2/drivers/tty/serial/8250/8250_core.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/tty/serial/8250/8250_core.c 2015-09-08 00:48:22.228363880 -0300 @@ -3339,7 +3339,7 @@ * The console_lock must be held when we get here. 
*/ static void serial8250_console_write(struct uart_8250_port *up, const char *s, - unsigned int count) + unsigned int count, unsigned int loglevel) { struct uart_port *port = &up->port; unsigned long flags; @@ -3413,11 +3413,11 @@ } static void univ8250_console_write(struct console *co, const char *s, - unsigned int count) + unsigned int count, unsigned int loglevel) { struct uart_8250_port *up = &serial8250_ports[co->index]; - serial8250_console_write(up, s, count); + serial8250_console_write(up, s, count, loglevel); } static unsigned int probe_baud(struct uart_port *port) diff -Nur linux-4.2/drivers/tty/serial/8250/8250_early.c linux-4.2-pck/drivers/tty/serial/8250/8250_early.c --- linux-4.2/drivers/tty/serial/8250/8250_early.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/tty/serial/8250/8250_early.c 2015-09-08 00:48:22.228363880 -0300 @@ -90,7 +90,7 @@ } static void __init early_serial8250_write(struct console *console, - const char *s, unsigned int count) + const char *s, unsigned int count, unsigned int loglevel) { struct earlycon_device *device = console->data; struct uart_port *port = &device->port; diff -Nur linux-4.2/drivers/tty/vt/vt.c linux-4.2-pck/drivers/tty/vt/vt.c --- linux-4.2/drivers/tty/vt/vt.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/drivers/tty/vt/vt.c 2015-09-08 00:48:22.228363880 -0300 @@ -71,6 +71,7 @@ */ #include +#include #include #include #include @@ -2531,16 +2532,44 @@ return kmsg_con; } +#ifdef CONFIG_VT_CKO +static unsigned int printk_color[8] __read_mostly = { + CONFIG_VT_PRINTK_EMERG_COLOR, /* KERN_EMERG */ + CONFIG_VT_PRINTK_ALERT_COLOR, /* KERN_ALERT */ + CONFIG_VT_PRINTK_CRIT_COLOR, /* KERN_CRIT */ + CONFIG_VT_PRINTK_ERR_COLOR, /* KERN_ERR */ + CONFIG_VT_PRINTK_WARNING_COLOR, /* KERN_WARNING */ + CONFIG_VT_PRINTK_NOTICE_COLOR, /* KERN_NOTICE */ + CONFIG_VT_PRINTK_INFO_COLOR, /* KERN_INFO */ + CONFIG_VT_PRINTK_DEBUG_COLOR, /* KERN_DEBUG */ +}; +module_param_array(printk_color, uint, NULL, S_IRUGO | S_IWUSR); + +static inline void vc_set_color(struct vc_data *vc, unsigned char color) +{ + vc->vc_color = color_table[color & 0xF] | + (color_table[(color >> 4) & 0x7] << 4) | + (color & 0x80); + update_attr(vc); +} +#else +static unsigned int printk_color[8]; +static inline void vc_set_color(const struct vc_data *vc, unsigned char c) +{ +} +#endif + /* * Console on virtual terminal * * The console must be locked when we get here. */ -static void vt_console_print(struct console *co, const char *b, unsigned count) +static void vt_console_print(struct console *co, const char *b, unsigned count, + unsigned int loglevel) { struct vc_data *vc = vc_cons[fg_console].d; - unsigned char c; + unsigned char current_color, c; static DEFINE_SPINLOCK(printing_lock); const ushort *start; ushort cnt = 0; @@ -2576,11 +2605,20 @@ start = (ushort *)vc->vc_pos; + /* + * We always get a valid loglevel - <8> and "no level" is transformed + * to <4> in the typical kernel. 
+ */ + current_color = printk_color[loglevel]; + vc_set_color(vc, current_color); + + /* Contrived structure to try to emulate original need_wrap behaviour * Problems caused when we have need_wrap set on '\n' character */ while (count--) { c = *b++; if (c == 10 || c == 13 || c == 8 || vc->vc_need_wrap) { + vc_set_color(vc, vc->vc_def_color); if (cnt > 0) { if (CON_IS_VISIBLE(vc)) vc->vc_sw->con_putcs(vc, start, cnt, vc->vc_y, vc->vc_x); @@ -2593,6 +2631,7 @@ bs(vc); start = (ushort *)vc->vc_pos; myx = vc->vc_x; + vc_set_color(vc, current_color); continue; } if (c != 13) @@ -2600,6 +2639,7 @@ cr(vc); start = (ushort *)vc->vc_pos; myx = vc->vc_x; + vc_set_color(vc, current_color); if (c == 10 || c == 13) continue; } @@ -2622,6 +2662,7 @@ vc->vc_need_wrap = 1; } } + vc_set_color(vc, vc->vc_def_color); set_cursor(vc); notify_update(vc); diff -Nur linux-4.2/fs/drop_caches.c linux-4.2-pck/fs/drop_caches.c --- linux-4.2/fs/drop_caches.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/fs/drop_caches.c 2015-09-08 11:20:32.183692475 -0300 @@ -8,6 +8,7 @@ #include #include #include +#include #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ @@ -37,6 +38,12 @@ iput(toput_inode); } +/* For TuxOnIce */ +void drop_pagecache(void) +{ + iterate_supers(drop_pagecache_sb, NULL); +} + int drop_caches_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { diff -Nur linux-4.2/fs/exec.c linux-4.2-pck/fs/exec.c --- linux-4.2/fs/exec.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/fs/exec.c 2015-09-08 00:51:03.460668141 -0300 @@ -19,7 +19,7 @@ * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary - * formats. + * formats. */ #include @@ -56,6 +56,9 @@ #include #include #include +#include + +#include #include #include @@ -784,8 +787,10 @@ if (err) goto exit; - if (name->name[0] != '\0') + if (name->name[0] != '\0') { fsnotify_open(file); + trace_open_exec(name->name); + } out: return file; @@ -1156,6 +1161,7 @@ /* An exec changes our domain. 
We are no longer part of the thread group */ current->self_exec_id++; + flush_signal_handlers(current, 0); do_close_on_exec(current->files); } diff -Nur linux-4.2/fs/open.c linux-4.2-pck/fs/open.c --- linux-4.2/fs/open.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/fs/open.c 2015-09-08 00:48:22.228363880 -0300 @@ -34,6 +34,9 @@ #include "internal.h" +#define CREATE_TRACE_POINTS +#include + int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { @@ -1029,6 +1032,7 @@ } else { fsnotify_open(f); fd_install(fd, f); + trace_do_sys_open(tmp->name, flags, mode); } } putname(tmp); diff -Nur linux-4.2/fs/proc/meminfo.c linux-4.2-pck/fs/proc/meminfo.c --- linux-4.2/fs/proc/meminfo.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/fs/proc/meminfo.c 2015-09-08 00:51:03.460668141 -0300 @@ -124,6 +124,9 @@ "SUnreclaim: %8lu kB\n" "KernelStack: %8lu kB\n" "PageTables: %8lu kB\n" +#ifdef CONFIG_UKSM + "KsmZeroPages: %8lu kB\n" +#endif #ifdef CONFIG_QUICKLIST "Quicklists: %8lu kB\n" #endif @@ -182,6 +185,9 @@ K(global_page_state(NR_SLAB_UNRECLAIMABLE)), global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_UKSM + K(global_page_state(NR_UKSM_ZERO_PAGES)), +#endif #ifdef CONFIG_QUICKLIST K(quicklist_total_size()), #endif diff -Nur linux-4.2/fs/super.c linux-4.2-pck/fs/super.c --- linux-4.2/fs/super.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/fs/super.c 2015-09-08 11:20:32.190358817 -0300 @@ -36,7 +36,7 @@ #include "internal.h" -static LIST_HEAD(super_blocks); +LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { diff -Nur linux-4.2/include/acpi/acconfig.h linux-4.2-pck/include/acpi/acconfig.h --- linux-4.2/include/acpi/acconfig.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/acpi/acconfig.h 2015-09-08 00:48:22.228363880 -0300 @@ -112,9 +112,19 @@ * *****************************************************************************/ -/* Version of ACPI supported */ - +/* + * Version of ACPI supported. This is a sad story. Windows reports a _REV of + * 2 regardless of the spec version implemented. Some vendors are using _REV + * as a way to distinguish between Windows and Linux, and are breaking systems + * in the process. We can't guarantee that they'll call _OSI before checking + * _REV, so hardcode this to 2 on x86 systems right now and leave it at the + * appropriate spec value for everybody else. 
+ */ +#ifdef CONFIG_X86 +#define ACPI_CA_SUPPORT_LEVEL 2 +#else #define ACPI_CA_SUPPORT_LEVEL 5 +#endif /* Maximum count for a semaphore object */ diff -Nur linux-4.2/include/asm-generic/pgtable.h linux-4.2-pck/include/asm-generic/pgtable.h --- linux-4.2/include/asm-generic/pgtable.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/asm-generic/pgtable.h 2015-09-08 00:51:03.460668141 -0300 @@ -555,12 +555,25 @@ unsigned long size); #endif +#ifdef CONFIG_UKSM +static inline int is_uksm_zero_pfn(unsigned long pfn) +{ + extern unsigned long uksm_zero_pfn; + return pfn == uksm_zero_pfn; +} +#else +static inline int is_uksm_zero_pfn(unsigned long pfn) +{ + return 0; +} +#endif + #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; - return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) @@ -569,7 +582,7 @@ static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; - return pfn == zero_pfn; + return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); } static inline unsigned long my_zero_pfn(unsigned long addr) diff -Nur linux-4.2/include/linux/bio.h linux-4.2-pck/include/linux/bio.h --- linux-4.2/include/linux/bio.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/bio.h 2015-09-08 11:20:32.227023691 -0300 @@ -32,6 +32,8 @@ /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include +extern int trap_non_toi_io; + #define BIO_DEBUG #ifdef BIO_DEBUG diff -Nur linux-4.2/include/linux/blk_types.h linux-4.2-pck/include/linux/blk_types.h --- linux-4.2/include/linux/blk_types.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/blk_types.h 2015-09-08 11:20:32.253689056 -0300 @@ -121,13 +121,14 @@ #define BIO_SNAP_STABLE 7 /* bio data must be snapshotted during write */ #define BIO_CHAIN 8 /* chained bio, ->bi_remaining in effect */ #define BIO_REFFED 9 /* bio has elevated ->bi_cnt */ +#define BIO_TOI 10 /* bio is TuxOnIce submitted */ /* * Flags starting here get preserved by bio_reset() - this includes * BIO_POOL_IDX() */ -#define BIO_RESET_BITS 13 -#define BIO_OWNS_VEC 13 /* bio_free() should free bvec */ +#define BIO_RESET_BITS 14 +#define BIO_OWNS_VEC 14 /* bio_free() should free bvec */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) diff -Nur linux-4.2/include/linux/cgroup_subsys.h linux-4.2-pck/include/linux/cgroup_subsys.h --- linux-4.2/include/linux/cgroup_subsys.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/cgroup_subsys.h 2015-09-08 00:48:22.228363880 -0300 @@ -35,6 +35,10 @@ SUBSYS(net_cls) #endif +#if IS_ENABLED(CONFIG_CGROUP_BFQIO) +SUBSYS(bfqio) +#endif + #if IS_ENABLED(CONFIG_CGROUP_PERF) SUBSYS(perf_event) #endif diff -Nur linux-4.2/include/linux/console.h linux-4.2-pck/include/linux/console.h --- linux-4.2/include/linux/console.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/console.h 2015-09-08 00:48:22.228363880 -0300 @@ -119,7 +119,7 @@ struct console { char name[16]; - void (*write)(struct console *, const char *, unsigned); + void (*write)(struct console *, const char *, unsigned, unsigned int); int (*read)(struct console *, char *, unsigned); struct tty_driver *(*device)(struct console *, int *); void (*unblank)(void); diff -Nur linux-4.2/include/linux/fs.h 
linux-4.2-pck/include/linux/fs.h --- linux-4.2/include/linux/fs.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/fs.h 2015-09-08 11:20:32.257022227 -0300 @@ -1258,6 +1258,8 @@ #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ +extern struct list_head super_blocks; + /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ @@ -1741,6 +1743,8 @@ #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif +#define S_ATOMIC_COPY 16384 /* Pages mapped with this inode need to be + atomically copied (gem) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -2269,6 +2273,13 @@ extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); +extern int fsync_super(struct super_block *); +extern int fsync_no_super(struct block_device *); +#define FS_FREEZER_FUSE 1 +#define FS_FREEZER_NORMAL 2 +#define FS_FREEZER_ALL (FS_FREEZER_FUSE | FS_FREEZER_NORMAL) +void freeze_filesystems(int which); +void thaw_filesystems(int which); extern struct super_block *blockdev_superblock; diff -Nur linux-4.2/include/linux/fs_uuid.h linux-4.2-pck/include/linux/fs_uuid.h --- linux-4.2/include/linux/fs_uuid.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/linux/fs_uuid.h 2015-09-08 11:20:32.257022227 -0300 @@ -0,0 +1,19 @@ +#include + +struct hd_struct; +struct block_device; + +struct fs_info { + char uuid[16]; + dev_t dev_t; + char *last_mount; + int last_mount_size; +}; + +int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek); +dev_t blk_lookup_fs_info(struct fs_info *seek); +struct fs_info *fs_info_from_block_dev(struct block_device *bdev); +void free_fs_info(struct fs_info *fs_info); +int bdev_matches_key(struct block_device *bdev, const char *key); +struct block_device *next_bdev_of_type(struct block_device *last, + const char *key); diff -Nur linux-4.2/include/linux/gfp.h linux-4.2-pck/include/linux/gfp.h --- linux-4.2/include/linux/gfp.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/gfp.h 2015-09-08 11:20:32.260355398 -0300 @@ -35,6 +35,7 @@ #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u #define ___GFP_WRITE 0x1000000u +#define ___GFP_TOI_NOTRACK 0x2000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -94,6 +95,7 @@ #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ +#define __GFP_TOI_NOTRACK ((__force gfp_t)___GFP_TOI_NOTRACK) /* Allocator wants page untracked by TOI */ /* * This may seem redundant, but it's a way of annotating false positives vs. 
@@ -101,7 +103,7 @@ */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 26 /* Room for N __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff -Nur linux-4.2/include/linux/ksm.h linux-4.2-pck/include/linux/ksm.h --- linux-4.2/include/linux/ksm.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/ksm.h 2015-09-08 00:51:03.460668141 -0300 @@ -19,21 +19,6 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); -int __ksm_enter(struct mm_struct *mm); -void __ksm_exit(struct mm_struct *mm); - -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -{ - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) - return __ksm_enter(mm); - return 0; -} - -static inline void ksm_exit(struct mm_struct *mm) -{ - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) - __ksm_exit(mm); -} static inline struct stable_node *page_stable_node(struct page *page) { @@ -64,6 +49,33 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); +#ifdef CONFIG_KSM_LEGACY +int __ksm_enter(struct mm_struct *mm); +void __ksm_exit(struct mm_struct *mm); +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) + return __ksm_enter(mm); + return 0; +} + +static inline void ksm_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) + __ksm_exit(mm); +} + +#elif defined(CONFIG_UKSM) +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} + +static inline void ksm_exit(struct mm_struct *mm) +{ +} +#endif /* !CONFIG_UKSM */ + #else /* !CONFIG_KSM */ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) @@ -106,4 +118,6 @@ #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ +#include + #endif /* __LINUX_KSM_H */ diff -Nur linux-4.2/include/linux/libata.h linux-4.2-pck/include/linux/libata.h --- linux-4.2/include/linux/libata.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/libata.h 2015-09-08 00:48:22.228363880 -0300 @@ -516,6 +516,7 @@ enum ata_lpm_policy { ATA_LPM_UNKNOWN, ATA_LPM_MAX_POWER, + ATA_LPM_FIRMWARE_DEFAULTS, ATA_LPM_MED_POWER, ATA_LPM_MIN_POWER, }; @@ -728,6 +729,8 @@ int spdn_cnt; /* ering is CLEAR_END, read comment above CLEAR_END */ struct ata_ering ering; + /* Initial DIPM configuration */ + bool init_dipm; }; /* Fields between ATA_DEVICE_CLEAR_BEGIN and ATA_DEVICE_CLEAR_END are @@ -798,6 +801,7 @@ struct ata_device device[ATA_MAX_DEVICES]; + u8 init_lpm; /* initial lpm configuration */ unsigned long last_lpm_change; /* when last LPM change happened */ }; #define ATA_LINK_CLEAR_BEGIN offsetof(struct ata_link, active_tag) diff -Nur linux-4.2/include/linux/mm.h linux-4.2-pck/include/linux/mm.h --- linux-4.2/include/linux/mm.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/mm.h 2015-09-08 11:20:32.273688080 -0300 @@ -2137,6 +2137,7 @@ void __user *, size_t *, loff_t *); #endif +void drop_pagecache(void); void drop_slab(void); void drop_slab_node(int nid); diff -Nur linux-4.2/include/linux/mm_types.h linux-4.2-pck/include/linux/mm_types.h --- linux-4.2/include/linux/mm_types.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/mm_types.h 
2015-09-08 00:51:03.460668141 -0300 @@ -322,6 +322,9 @@ #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif +#ifdef CONFIG_UKSM + struct vma_slot *uksm_vma_slot; +#endif }; struct core_thread { diff -Nur linux-4.2/include/linux/mmzone.h linux-4.2-pck/include/linux/mmzone.h --- linux-4.2/include/linux/mmzone.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/mmzone.h 2015-09-08 00:51:03.460668141 -0300 @@ -157,6 +157,9 @@ WORKINGSET_NODERECLAIM, NR_ANON_TRANSPARENT_HUGEPAGES, NR_FREE_CMA_PAGES, +#ifdef CONFIG_UKSM + NR_UKSM_ZERO_PAGES, +#endif NR_VM_ZONE_STAT_ITEMS }; /* @@ -872,7 +875,7 @@ } /** - * is_highmem - helper function to quickly check if a struct zone is a + * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone - pointer to struct zone variable diff -Nur linux-4.2/include/linux/page-flags.h linux-4.2-pck/include/linux/page-flags.h --- linux-4.2/include/linux/page-flags.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/page-flags.h 2015-09-08 11:20:32.280354422 -0300 @@ -109,6 +109,12 @@ #ifdef CONFIG_TRANSPARENT_HUGEPAGE PG_compound_lock, #endif +#ifdef CONFIG_TOI_INCREMENTAL + PG_toi_untracked, /* Don't track dirtiness of this page - assume always dirty */ + PG_toi_ro, /* Page was made RO by TOI */ + PG_toi_cbw, /* Copy the page before it is written to */ + PG_toi_dirty, /* Page has been modified */ +#endif __NR_PAGEFLAGS, /* Filesystems */ @@ -288,6 +294,17 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif +#ifdef CONFIG_TOI_INCREMENTAL +PAGEFLAG(TOI_RO, toi_ro) +PAGEFLAG(TOI_Dirty, toi_dirty) +PAGEFLAG(TOI_Untracked, toi_untracked) +PAGEFLAG(TOI_CBW, toi_cbw) +#else +PAGEFLAG_FALSE(TOI_RO) +PAGEFLAG_FALSE(TOI_Dirty) +PAGEFLAG_FALSE(TOI_Untracked) +PAGEFLAG_FALSE(TOI_CBW) +#endif /* * On an anonymous page mapped into a user virtual memory area, @@ -642,8 +659,12 @@ * __PG_HWPOISON is exceptional because it needs to be kept beyond page's * alloc-free cycle to prevent from reusing the page. 
*/ -#define PAGE_FLAGS_CHECK_AT_PREP \ - (((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) +#ifdef CONFIG_TOI_INCREMENTAL +#define PAGE_FLAGS_CHECK_AT_PREP (((1 << NR_PAGEFLAGS) - 1) & \ + ~((1 << PG_toi_dirty) | (1 << PG_toi_ro) | ~__PG_HWPOISON)) +#else +#define PAGE_FLAGS_CHECK_AT_PREP (((1 << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) +#endif #define PAGE_FLAGS_PRIVATE \ (1 << PG_private | 1 << PG_private_2) diff -Nur linux-4.2/include/linux/shmem_fs.h linux-4.2-pck/include/linux/shmem_fs.h --- linux-4.2/include/linux/shmem_fs.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/shmem_fs.h 2015-09-08 11:20:32.283687592 -0300 @@ -48,9 +48,10 @@ extern int shmem_init(void); extern int shmem_fill_super(struct super_block *sb, void *data, int silent); extern struct file *shmem_file_setup(const char *name, - loff_t size, unsigned long flags); + loff_t size, unsigned long flags, + int atomic_copy); extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, - unsigned long flags); + unsigned long flags, int atomic_copy); extern int shmem_zero_setup(struct vm_area_struct *); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); extern bool shmem_mapping(struct address_space *mapping); diff -Nur linux-4.2/include/linux/sradix-tree.h linux-4.2-pck/include/linux/sradix-tree.h --- linux-4.2/include/linux/sradix-tree.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/linux/sradix-tree.h 2015-09-08 00:51:03.460668141 -0300 @@ -0,0 +1,77 @@ +#ifndef _LINUX_SRADIX_TREE_H +#define _LINUX_SRADIX_TREE_H + + +#define INIT_SRADIX_TREE(root, mask) \ +do { \ + (root)->height = 0; \ + (root)->gfp_mask = (mask); \ + (root)->rnode = NULL; \ +} while (0) + +#define ULONG_BITS (sizeof(unsigned long) * 8) +#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +//#define SRADIX_TREE_MAP_SHIFT 6 +//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) +//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) + +struct sradix_tree_node { + unsigned int height; /* Height from the bottom */ + unsigned int count; + unsigned int fulls; /* Number of full sublevel trees */ + struct sradix_tree_node *parent; + void *stores[0]; +}; + +/* A simple radix tree implementation */ +struct sradix_tree_root { + unsigned int height; + struct sradix_tree_node *rnode; + + /* Where found to have available empty stores in its sublevels */ + struct sradix_tree_node *enter_node; + unsigned int shift; + unsigned int stores_size; + unsigned int mask; + unsigned long min; /* The first hole index */ + unsigned long num; + //unsigned long *height_to_maxindex; + + /* How the node is allocated and freed. 
*/ + struct sradix_tree_node *(*alloc)(void); + void (*free)(struct sradix_tree_node *node); + + /* When a new node is added and removed */ + void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); + void (*assign)(struct sradix_tree_node *node, unsigned index, void *item); + void (*rm)(struct sradix_tree_node *node, unsigned offset); +}; + +struct sradix_tree_path { + struct sradix_tree_node *node; + int offset; +}; + +static inline +void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) +{ + root->height = 0; + root->rnode = NULL; + root->shift = shift; + root->stores_size = 1UL << shift; + root->mask = root->stores_size - 1; +} + + +extern void *sradix_tree_next(struct sradix_tree_root *root, + struct sradix_tree_node *node, unsigned long index, + int (*iter)(void *, unsigned long)); + +extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); + +extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, + struct sradix_tree_node *node, unsigned long index); + +extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); + +#endif /* _LINUX_SRADIX_TREE_H */ diff -Nur linux-4.2/include/linux/suspend.h linux-4.2-pck/include/linux/suspend.h --- linux-4.2/include/linux/suspend.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/suspend.h 2015-09-08 11:20:32.287020763 -0300 @@ -452,6 +452,74 @@ #define pm_print_times_enabled (false) #endif +enum { + TOI_CAN_HIBERNATE, + TOI_CAN_RESUME, + TOI_RESUME_DEVICE_OK, + TOI_NORESUME_SPECIFIED, + TOI_SANITY_CHECK_PROMPT, + TOI_CONTINUE_REQ, + TOI_RESUMED_BEFORE, + TOI_BOOT_TIME, + TOI_NOW_RESUMING, + TOI_IGNORE_LOGLEVEL, + TOI_TRYING_TO_RESUME, + TOI_LOADING_ALT_IMAGE, + TOI_STOP_RESUME, + TOI_IO_STOPPED, + TOI_NOTIFIERS_PREPARE, + TOI_CLUSTER_MODE, + TOI_BOOT_KERNEL, + TOI_DEVICE_HOTPLUG_LOCKED, +}; + +#ifdef CONFIG_TOI + +/* Used in init dir files */ +extern unsigned long toi_state; +#define set_toi_state(bit) (set_bit(bit, &toi_state)) +#define clear_toi_state(bit) (clear_bit(bit, &toi_state)) +#define test_toi_state(bit) (test_bit(bit, &toi_state)) +extern int toi_running; + +#define test_action_state(bit) (test_bit(bit, &toi_bkd.toi_action)) +extern int try_tuxonice_hibernate(void); + +#else /* !CONFIG_TOI */ + +#define toi_state (0) +#define set_toi_state(bit) do { } while (0) +#define clear_toi_state(bit) do { } while (0) +#define test_toi_state(bit) (0) +#define toi_running (0) + +static inline int try_tuxonice_hibernate(void) { return 0; } +#define test_action_state(bit) (0) + +#endif /* CONFIG_TOI */ + +#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_TOI +extern void try_tuxonice_resume(void); +#else +#define try_tuxonice_resume() do { } while (0) +#endif + +extern int resume_attempted; +extern int software_resume(void); + +static inline void check_resume_attempted(void) +{ + if (resume_attempted) + return; + + software_resume(); +} +#else +#define check_resume_attempted() do { } while (0) +#define resume_attempted (0) +#endif + #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ diff -Nur linux-4.2/include/linux/swap.h linux-4.2-pck/include/linux/swap.h --- linux-4.2/include/linux/swap.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/swap.h 2015-09-08 11:20:32.287020763 -0300 @@ -289,6 +289,7 @@ extern unsigned long totalreserve_pages; extern unsigned long dirty_balance_reserve; extern unsigned long nr_free_buffer_pages(void); +extern unsigned long nr_unallocated_buffer_pages(void); extern 
unsigned long nr_free_pagecache_pages(void); /* Definition of global_page_state not available yet */ @@ -328,6 +329,8 @@ struct zone *zone, unsigned long *nr_scanned); extern unsigned long shrink_all_memory(unsigned long nr_pages); +extern unsigned long shrink_memory_mask(unsigned long nr_to_reclaim, + gfp_t mask); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); extern unsigned long vm_total_pages; @@ -428,13 +431,17 @@ extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); +extern sector_t map_swap_entry(swp_entry_t entry, struct block_device **); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); +extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int page_swapcount(struct page *); extern struct swap_info_struct *page_swap_info(struct page *); extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; +extern void get_swap_range_of_type(int type, swp_entry_t *start, + swp_entry_t *end, unsigned int limit); #else /* CONFIG_SWAP */ diff -Nur linux-4.2/include/linux/tcp.h linux-4.2-pck/include/linux/tcp.h --- linux-4.2/include/linux/tcp.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/tcp.h 2015-09-08 00:48:31.501257347 -0300 @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -328,6 +329,21 @@ struct tcp_md5sig_info __rcu *md5sig_info; #endif +#ifdef CONFIG_TCP_STEALTH +/* Stealth TCP socket configuration */ + struct { + #define TCP_STEALTH_MODE_AUTH BIT(0) + #define TCP_STEALTH_MODE_INTEGRITY BIT(1) + #define TCP_STEALTH_MODE_INTEGRITY_LEN BIT(2) + u8 mode; + u8 secret[MD5_MESSAGE_BYTES]; + u16 integrity_hash; + size_t integrity_len; + struct skb_mstamp mstamp; + bool saw_tsval; + } stealth; +#endif + /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big diff -Nur linux-4.2/include/linux/thinkpad_ec.h linux-4.2-pck/include/linux/thinkpad_ec.h --- linux-4.2/include/linux/thinkpad_ec.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/linux/thinkpad_ec.h 2015-09-08 00:48:22.228363880 -0300 @@ -0,0 +1,47 @@ +/* + * thinkpad_ec.h - interface to ThinkPad embedded controller LPC3 functions + * + * Copyright (C) 2005 Shem Multinymous + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _THINKPAD_EC_H +#define _THINKPAD_EC_H + +#ifdef __KERNEL__ + +#define TP_CONTROLLER_ROW_LEN 16 + +/* EC transactions input and output (possibly partial) vectors of 16 bytes. 
*/ +struct thinkpad_ec_row { + u16 mask; /* bitmap of which entries of val[] are meaningful */ + u8 val[TP_CONTROLLER_ROW_LEN]; +}; + +extern int __must_check thinkpad_ec_lock(void); +extern int __must_check thinkpad_ec_try_lock(void); +extern void thinkpad_ec_unlock(void); + +extern int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, + struct thinkpad_ec_row *data); +extern int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, + struct thinkpad_ec_row *mask); +extern int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args); +extern void thinkpad_ec_invalidate(void); + + +#endif /* __KERNEL */ +#endif /* _THINKPAD_EC_H */ diff -Nur linux-4.2/include/linux/thread_info.h linux-4.2-pck/include/linux/thread_info.h --- linux-4.2/include/linux/thread_info.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/linux/thread_info.h 2015-09-08 11:20:32.287020763 -0300 @@ -56,9 +56,9 @@ #ifdef __KERNEL__ #ifdef CONFIG_DEBUG_STACK_USAGE -# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK | ___GFP_TOI_NOTRACK | __GFP_ZERO) #else -# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) +# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK | ___GFP_TOI_NOTRACK) #endif /* diff -Nur linux-4.2/include/linux/tuxonice.h linux-4.2-pck/include/linux/tuxonice.h --- linux-4.2/include/linux/tuxonice.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/linux/tuxonice.h 2015-09-08 11:20:32.287020763 -0300 @@ -0,0 +1,48 @@ +/* + * include/linux/tuxonice.h + * + * Copyright (C) 2015 Nigel Cunningham (nigel at tuxonice net) + * + * This file is released under the GPLv2. + */ + +#ifndef INCLUDE_LINUX_TUXONICE_H +#define INCLUDE_LINUX_TUXONICE_H +#ifdef CONFIG_TOI_INCREMENTAL +extern void toi_set_logbuf_untracked(void); +extern int toi_make_writable(pgd_t *pgd, unsigned long address); + +static inline int toi_incremental_support(void) +{ + return 1; +} + +/* Copy Before Write */ +struct toi_cbw { + unsigned long pfn; + void *virt; + struct toi_cbw *next; +}; + +struct toi_cbw_state { + bool active; /* Is a fault handler running? */ + bool enabled; /* Are we doing copy before write? */ + int size; /* The number of pages allocated */ + struct toi_cbw *first, *next, *last; /* Pointers to the data structure */ +}; + +#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw)) +DECLARE_PER_CPU(struct toi_cbw_state, toi_cbw_states); +#else +#define toi_set_logbuf_untracked() do { } while(0) +static inline int toi_make_writable(pgd_t *pgd, unsigned long addr) +{ + return 0; +} + +static inline int toi_incremental_support(void) +{ + return 0; +} +#endif +#endif diff -Nur linux-4.2/include/linux/uksm.h linux-4.2-pck/include/linux/uksm.h --- linux-4.2/include/linux/uksm.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/linux/uksm.h 2015-09-08 00:51:03.460668141 -0300 @@ -0,0 +1,146 @@ +#ifndef __LINUX_UKSM_H +#define __LINUX_UKSM_H +/* + * Memory merging support. + * + * This code enables dynamic sharing of identical pages found in different + * memory areas, even if they are not shared by fork(). + */ + +/* if !CONFIG_UKSM this file should not be compiled at all. 
*/ +#ifdef CONFIG_UKSM + +#include +#include +#include +#include +#include + +extern unsigned long zero_pfn __read_mostly; +extern unsigned long uksm_zero_pfn __read_mostly; +extern struct page *empty_uksm_zero_page; + +/* must be done before linked to mm */ +extern void uksm_vma_add_new(struct vm_area_struct *vma); +extern void uksm_remove_vma(struct vm_area_struct *vma); + +#define UKSM_SLOT_NEED_SORT (1 << 0) +#define UKSM_SLOT_NEED_RERAND (1 << 1) +#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ +#define UKSM_SLOT_FUL_SCANNED (1 << 3) +#define UKSM_SLOT_IN_UKSM (1 << 4) + +struct vma_slot { + struct sradix_tree_node *snode; + unsigned long sindex; + + struct list_head slot_list; + unsigned long fully_scanned_round; + unsigned long dedup_num; + unsigned long pages_scanned; + unsigned long last_scanned; + unsigned long pages_to_scan; + struct scan_rung *rung; + struct page **rmap_list_pool; + unsigned int *pool_counts; + unsigned long pool_size; + struct vm_area_struct *vma; + struct mm_struct *mm; + unsigned long ctime_j; + unsigned long pages; + unsigned long flags; + unsigned long pages_cowed; /* pages cowed this round */ + unsigned long pages_merged; /* pages merged this round */ + unsigned long pages_bemerged; + + /* when it has page merged in this eval round */ + struct list_head dedup_list; +}; + +static inline void uksm_unmap_zero_page(pte_t pte) +{ + if (pte_pfn(pte) == uksm_zero_pfn) + __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); +} + +static inline void uksm_map_zero_page(pte_t pte) +{ + if (pte_pfn(pte) == uksm_zero_pfn) + __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); +} + +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) +{ + if (vma->uksm_vma_slot && PageKsm(page)) + vma->uksm_vma_slot->pages_cowed++; +} + +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) +{ + if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) + vma->uksm_vma_slot->pages_cowed++; +} + +static inline int uksm_flags_can_scan(unsigned long vm_flags) +{ +#ifndef VM_SAO +#define VM_SAO 0 +#endif + return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_HUGETLB | VM_MIXEDMAP | VM_SHARED + | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO)); +} + +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) +{ + if (uksm_flags_can_scan(*vm_flags_p)) + *vm_flags_p |= VM_MERGEABLE; +} + +/* + * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will + * be removed when uksm zero page patch is stable enough. 
+ */ +static inline void uksm_bugon_zeropage(pte_t pte) +{ + BUG_ON(pte_pfn(pte) == uksm_zero_pfn); +} +#else +static inline void uksm_vma_add_new(struct vm_area_struct *vma) +{ +} + +static inline void uksm_remove_vma(struct vm_area_struct *vma) +{ +} + +static inline void uksm_unmap_zero_page(pte_t pte) +{ +} + +static inline void uksm_map_zero_page(pte_t pte) +{ +} + +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) +{ +} + +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) +{ +} + +static inline int uksm_flags_can_scan(unsigned long vm_flags) +{ + return 0; +} + +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) +{ +} + +static inline void uksm_bugon_zeropage(pte_t pte) +{ +} +#endif /* !CONFIG_UKSM */ +#endif /* __LINUX_UKSM_H */ diff -Nur linux-4.2/include/net/netfilter/nf_conntrack.h linux-4.2-pck/include/net/netfilter/nf_conntrack.h --- linux-4.2/include/net/netfilter/nf_conntrack.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/net/netfilter/nf_conntrack.h 2015-09-08 00:48:22.228363880 -0300 @@ -292,6 +292,7 @@ void init_nf_conntrack_hash_rnd(void); struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags); +void nf_ct_tmpl_free(struct nf_conn *tmpl); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) diff -Nur linux-4.2/include/net/secure_seq.h linux-4.2-pck/include/net/secure_seq.h --- linux-4.2/include/net/secure_seq.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/net/secure_seq.h 2015-09-08 00:48:31.501257347 -0300 @@ -14,5 +14,10 @@ __be16 sport, __be16 dport); u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, __be16 sport, __be16 dport); +#ifdef CONFIG_TCP_STEALTH +u32 tcp_stealth_do_auth(struct sock *sk, struct sk_buff *skb); +u32 tcp_stealth_sequence_number(struct sock *sk, __be32 *daddr, + u32 daddr_size, __be16 dport); +#endif #endif /* _NET_SECURE_SEQ */ diff -Nur linux-4.2/include/net/tcp.h linux-4.2-pck/include/net/tcp.h --- linux-4.2/include/net/tcp.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/net/tcp.h 2015-09-08 00:48:31.501257347 -0300 @@ -440,6 +440,12 @@ struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); +#ifdef CONFIG_TCP_STEALTH +const bool tcp_parse_tsval_option(u32 *tsval, const struct tcphdr *th); +int tcp_stealth_integrity(u16 *hash, u8 *secret, u8 *payload, int len); +#define be32_isn_to_be16_av(x) (((__be16 *)&x)[0]) +#define be32_isn_to_be16_ih(x) (((__be16 *)&x)[1]) +#endif /* * TCP v4 functions exported for the inet6 API diff -Nur linux-4.2/include/trace/events/fs.h linux-4.2-pck/include/trace/events/fs.h --- linux-4.2/include/trace/events/fs.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/trace/events/fs.h 2015-09-08 00:48:22.228363880 -0300 @@ -0,0 +1,53 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs + +#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_H + +#include +#include + +TRACE_EVENT(do_sys_open, + + TP_PROTO(const char *filename, int flags, int mode), + + TP_ARGS(filename, flags, mode), + + TP_STRUCT__entry( + __string( filename, filename ) + __field( int, flags ) + __field( int, mode ) + ), + + TP_fast_assign( + __assign_str(filename, filename); + __entry->flags = flags; + __entry->mode = mode; + ), + + TP_printk("\"%s\" %x %o", + __get_str(filename), 
__entry->flags, __entry->mode) +); + +TRACE_EVENT(open_exec, + + TP_PROTO(const char *filename), + + TP_ARGS(filename), + + TP_STRUCT__entry( + __string( filename, filename ) + ), + + TP_fast_assign( + __assign_str(filename, filename); + ), + + TP_printk("\"%s\"", + __get_str(filename)) +); + +#endif /* _TRACE_FS_H */ + +/* This part must be outside protection */ +#include diff -Nur linux-4.2/include/uapi/linux/Kbuild linux-4.2-pck/include/uapi/linux/Kbuild --- linux-4.2/include/uapi/linux/Kbuild 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/uapi/linux/Kbuild 2015-09-08 00:48:22.228363880 -0300 @@ -216,6 +216,7 @@ header-y += jffs2.h header-y += joystick.h header-y += kcmp.h +header-y += kdbus.h header-y += kdev_t.h header-y += kd.h header-y += kernelcapi.h diff -Nur linux-4.2/include/uapi/linux/kdbus.h linux-4.2-pck/include/uapi/linux/kdbus.h --- linux-4.2/include/uapi/linux/kdbus.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/include/uapi/linux/kdbus.h 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,984 @@ +/* + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef _UAPI_KDBUS_H_ +#define _UAPI_KDBUS_H_ + +#include +#include + +#define KDBUS_IOCTL_MAGIC 0x95 +#define KDBUS_SRC_ID_KERNEL (0) +#define KDBUS_DST_ID_NAME (0) +#define KDBUS_MATCH_ID_ANY (~0ULL) +#define KDBUS_DST_ID_BROADCAST (~0ULL) +#define KDBUS_FLAG_NEGOTIATE (1ULL << 63) + +/** + * struct kdbus_notify_id_change - name registry change message + * @id: New or former owner of the name + * @flags: flags field from KDBUS_HELLO_* + * + * Sent from kernel to userspace when the owner or activator of + * a well-known name changes. + * + * Attached to: + * KDBUS_ITEM_ID_ADD + * KDBUS_ITEM_ID_REMOVE + */ +struct kdbus_notify_id_change { + __u64 id; + __u64 flags; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_notify_name_change - name registry change message + * @old_id: ID and flags of former owner of a name + * @new_id: ID and flags of new owner of a name + * @name: Well-known name + * + * Sent from kernel to userspace when the owner or activator of + * a well-known name changes. + * + * Attached to: + * KDBUS_ITEM_NAME_ADD + * KDBUS_ITEM_NAME_REMOVE + * KDBUS_ITEM_NAME_CHANGE + */ +struct kdbus_notify_name_change { + struct kdbus_notify_id_change old_id; + struct kdbus_notify_id_change new_id; + char name[0]; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_creds - process credentials + * @uid: User ID + * @euid: Effective UID + * @suid: Saved UID + * @fsuid: Filesystem UID + * @gid: Group ID + * @egid: Effective GID + * @sgid: Saved GID + * @fsgid: Filesystem GID + * + * Attached to: + * KDBUS_ITEM_CREDS + */ +struct kdbus_creds { + __u64 uid; + __u64 euid; + __u64 suid; + __u64 fsuid; + __u64 gid; + __u64 egid; + __u64 sgid; + __u64 fsgid; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_pids - process identifiers + * @pid: Process ID + * @tid: Thread ID + * @ppid: Parent process ID + * + * The PID and TID of a process. 
+ * + * Attached to: + * KDBUS_ITEM_PIDS + */ +struct kdbus_pids { + __u64 pid; + __u64 tid; + __u64 ppid; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_caps - process capabilities + * @last_cap: Highest currently known capability bit + * @caps: Variable number of 32-bit capabilities flags + * + * Contains a variable number of 32-bit capabilities flags. + * + * Attached to: + * KDBUS_ITEM_CAPS + */ +struct kdbus_caps { + __u32 last_cap; + __u32 caps[0]; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_audit - audit information + * @sessionid: The audit session ID + * @loginuid: The audit login uid + * + * Attached to: + * KDBUS_ITEM_AUDIT + */ +struct kdbus_audit { + __u32 sessionid; + __u32 loginuid; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_timestamp + * @seqnum: Global per-domain message sequence number + * @monotonic_ns: Monotonic timestamp, in nanoseconds + * @realtime_ns: Realtime timestamp, in nanoseconds + * + * Attached to: + * KDBUS_ITEM_TIMESTAMP + */ +struct kdbus_timestamp { + __u64 seqnum; + __u64 monotonic_ns; + __u64 realtime_ns; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_vec - I/O vector for kdbus payload items + * @size: The size of the vector + * @address: Memory address of data buffer + * @offset: Offset in the in-message payload memory, + * relative to the message head + * + * Attached to: + * KDBUS_ITEM_PAYLOAD_VEC, KDBUS_ITEM_PAYLOAD_OFF + */ +struct kdbus_vec { + __u64 size; + union { + __u64 address; + __u64 offset; + }; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_bloom_parameter - bus-wide bloom parameters + * @size: Size of the bit field in bytes (m / 8) + * @n_hash: Number of hash functions used (k) + */ +struct kdbus_bloom_parameter { + __u64 size; + __u64 n_hash; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_bloom_filter - bloom filter containing n elements + * @generation: Generation of the element set in the filter + * @data: Bit field, multiple of 8 bytes + */ +struct kdbus_bloom_filter { + __u64 generation; + __u64 data[0]; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_memfd - a kdbus memfd + * @start: The offset into the memfd where the segment starts + * @size: The size of the memfd segment + * @fd: The file descriptor number + * @__pad: Padding to ensure proper alignment and size + * + * Attached to: + * KDBUS_ITEM_PAYLOAD_MEMFD + */ +struct kdbus_memfd { + __u64 start; + __u64 size; + int fd; + __u32 __pad; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_name - a registered well-known name with its flags + * @flags: Flags from KDBUS_NAME_* + * @name: Well-known name + * + * Attached to: + * KDBUS_ITEM_OWNED_NAME + */ +struct kdbus_name { + __u64 flags; + char name[0]; +} __attribute__((__aligned__(8))); + +/** + * enum kdbus_policy_access_type - permissions of a policy record + * @_KDBUS_POLICY_ACCESS_NULL: Uninitialized/invalid + * @KDBUS_POLICY_ACCESS_USER: Grant access to a uid + * @KDBUS_POLICY_ACCESS_GROUP: Grant access to gid + * @KDBUS_POLICY_ACCESS_WORLD: World-accessible + */ +enum kdbus_policy_access_type { + _KDBUS_POLICY_ACCESS_NULL, + KDBUS_POLICY_ACCESS_USER, + KDBUS_POLICY_ACCESS_GROUP, + KDBUS_POLICY_ACCESS_WORLD, +}; + +/** + * enum kdbus_policy_access_flags - mode flags + * @KDBUS_POLICY_OWN: Allow to own a well-known name + * Implies KDBUS_POLICY_TALK and KDBUS_POLICY_SEE + * @KDBUS_POLICY_TALK: Allow communication to a well-known name + * Implies KDBUS_POLICY_SEE + * @KDBUS_POLICY_SEE: Allow to see a well-known name + 
*/ +enum kdbus_policy_type { + KDBUS_POLICY_SEE = 0, + KDBUS_POLICY_TALK, + KDBUS_POLICY_OWN, +}; + +/** + * struct kdbus_policy_access - policy access item + * @type: One of KDBUS_POLICY_ACCESS_* types + * @access: Access to grant + * @id: For KDBUS_POLICY_ACCESS_USER, the uid + * For KDBUS_POLICY_ACCESS_GROUP, the gid + */ +struct kdbus_policy_access { + __u64 type; /* USER, GROUP, WORLD */ + __u64 access; /* OWN, TALK, SEE */ + __u64 id; /* uid, gid, 0 */ +} __attribute__((__aligned__(8))); + +/** + * enum kdbus_attach_flags - flags for metadata attachments + * @KDBUS_ATTACH_TIMESTAMP: Timestamp + * @KDBUS_ATTACH_CREDS: Credentials + * @KDBUS_ATTACH_PIDS: PIDs + * @KDBUS_ATTACH_AUXGROUPS: Auxiliary groups + * @KDBUS_ATTACH_NAMES: Well-known names + * @KDBUS_ATTACH_TID_COMM: The "comm" process identifier of the TID + * @KDBUS_ATTACH_PID_COMM: The "comm" process identifier of the PID + * @KDBUS_ATTACH_EXE: The path of the executable + * @KDBUS_ATTACH_CMDLINE: The process command line + * @KDBUS_ATTACH_CGROUP: The cgroup membership + * @KDBUS_ATTACH_CAPS: The process capabilities + * @KDBUS_ATTACH_SECLABEL: The security label + * @KDBUS_ATTACH_AUDIT: The audit IDs + * @KDBUS_ATTACH_CONN_DESCRIPTION: The human-readable connection name + * @_KDBUS_ATTACH_ALL: All of the above + * @_KDBUS_ATTACH_ANY: Wildcard match to enable any kind of + * metadata. + */ +enum kdbus_attach_flags { + KDBUS_ATTACH_TIMESTAMP = 1ULL << 0, + KDBUS_ATTACH_CREDS = 1ULL << 1, + KDBUS_ATTACH_PIDS = 1ULL << 2, + KDBUS_ATTACH_AUXGROUPS = 1ULL << 3, + KDBUS_ATTACH_NAMES = 1ULL << 4, + KDBUS_ATTACH_TID_COMM = 1ULL << 5, + KDBUS_ATTACH_PID_COMM = 1ULL << 6, + KDBUS_ATTACH_EXE = 1ULL << 7, + KDBUS_ATTACH_CMDLINE = 1ULL << 8, + KDBUS_ATTACH_CGROUP = 1ULL << 9, + KDBUS_ATTACH_CAPS = 1ULL << 10, + KDBUS_ATTACH_SECLABEL = 1ULL << 11, + KDBUS_ATTACH_AUDIT = 1ULL << 12, + KDBUS_ATTACH_CONN_DESCRIPTION = 1ULL << 13, + _KDBUS_ATTACH_ALL = (1ULL << 14) - 1, + _KDBUS_ATTACH_ANY = ~0ULL +}; + +/** + * enum kdbus_item_type - item types to chain data in a list + * @_KDBUS_ITEM_NULL: Uninitialized/invalid + * @_KDBUS_ITEM_USER_BASE: Start of user items + * @KDBUS_ITEM_NEGOTIATE: Negotiate supported items + * @KDBUS_ITEM_PAYLOAD_VEC: Vector to data + * @KDBUS_ITEM_PAYLOAD_OFF: Data at returned offset to message head + * @KDBUS_ITEM_PAYLOAD_MEMFD: Data as sealed memfd + * @KDBUS_ITEM_FDS: Attached file descriptors + * @KDBUS_ITEM_CANCEL_FD: FD used to cancel a synchronous + * operation by writing to it from + * userspace + * @KDBUS_ITEM_BLOOM_PARAMETER: Bus-wide bloom parameters, used with + * KDBUS_CMD_BUS_MAKE, carries a + * struct kdbus_bloom_parameter + * @KDBUS_ITEM_BLOOM_FILTER: Bloom filter carried with a message, + * used to match against a bloom mask of a + * connection, carries a struct + * kdbus_bloom_filter + * @KDBUS_ITEM_BLOOM_MASK: Bloom mask used to match against a + * message's bloom filter + * @KDBUS_ITEM_DST_NAME: Destination's well-known name + * @KDBUS_ITEM_MAKE_NAME: Name of domain, bus, endpoint + * @KDBUS_ITEM_ATTACH_FLAGS_SEND: Attach-flags, used for updating which + * metadata a connection opts in to send + * @KDBUS_ITEM_ATTACH_FLAGS_RECV: Attach-flags, used for updating which + * metadata a connection requests to + * receive for each received message + * @KDBUS_ITEM_ID: Connection ID + * @KDBUS_ITEM_NAME: Well-known name with flags + * @_KDBUS_ITEM_ATTACH_BASE: Start of metadata attach items + * @KDBUS_ITEM_TIMESTAMP: Timestamp + * @KDBUS_ITEM_CREDS: Process credentials + * @KDBUS_ITEM_PIDS: Process identifiers + 
* @KDBUS_ITEM_AUXGROUPS: Auxiliary process groups + * @KDBUS_ITEM_OWNED_NAME: A name owned by the associated + * connection + * @KDBUS_ITEM_TID_COMM: Thread ID "comm" identifier + * (Don't trust this, see below.) + * @KDBUS_ITEM_PID_COMM: Process ID "comm" identifier + * (Don't trust this, see below.) + * @KDBUS_ITEM_EXE: The path of the executable + * (Don't trust this, see below.) + * @KDBUS_ITEM_CMDLINE: The process command line + * (Don't trust this, see below.) + * @KDBUS_ITEM_CGROUP: The cgroup membership + * @KDBUS_ITEM_CAPS: The process capabilities + * @KDBUS_ITEM_SECLABEL: The security label + * @KDBUS_ITEM_AUDIT: The audit IDs + * @KDBUS_ITEM_CONN_DESCRIPTION: The connection's human-readable name + * (debugging) + * @_KDBUS_ITEM_POLICY_BASE: Start of policy items + * @KDBUS_ITEM_POLICY_ACCESS: Policy access block + * @_KDBUS_ITEM_KERNEL_BASE: Start of kernel-generated message items + * @KDBUS_ITEM_NAME_ADD: Notification in kdbus_notify_name_change + * @KDBUS_ITEM_NAME_REMOVE: Notification in kdbus_notify_name_change + * @KDBUS_ITEM_NAME_CHANGE: Notification in kdbus_notify_name_change + * @KDBUS_ITEM_ID_ADD: Notification in kdbus_notify_id_change + * @KDBUS_ITEM_ID_REMOVE: Notification in kdbus_notify_id_change + * @KDBUS_ITEM_REPLY_TIMEOUT: Timeout has been reached + * @KDBUS_ITEM_REPLY_DEAD: Destination died + * + * N.B: The process and thread COMM fields, as well as the CMDLINE and + * EXE fields may be altered by unprivileged processes and should + * hence *not* be used for security decisions. Peers should make use of + * these items only for informational purposes, such as generating log + * records. + */ +enum kdbus_item_type { + _KDBUS_ITEM_NULL, + _KDBUS_ITEM_USER_BASE, + KDBUS_ITEM_NEGOTIATE = _KDBUS_ITEM_USER_BASE, + KDBUS_ITEM_PAYLOAD_VEC, + KDBUS_ITEM_PAYLOAD_OFF, + KDBUS_ITEM_PAYLOAD_MEMFD, + KDBUS_ITEM_FDS, + KDBUS_ITEM_CANCEL_FD, + KDBUS_ITEM_BLOOM_PARAMETER, + KDBUS_ITEM_BLOOM_FILTER, + KDBUS_ITEM_BLOOM_MASK, + KDBUS_ITEM_DST_NAME, + KDBUS_ITEM_MAKE_NAME, + KDBUS_ITEM_ATTACH_FLAGS_SEND, + KDBUS_ITEM_ATTACH_FLAGS_RECV, + KDBUS_ITEM_ID, + KDBUS_ITEM_NAME, + KDBUS_ITEM_DST_ID, + + /* keep these item types in sync with KDBUS_ATTACH_* flags */ + _KDBUS_ITEM_ATTACH_BASE = 0x1000, + KDBUS_ITEM_TIMESTAMP = _KDBUS_ITEM_ATTACH_BASE, + KDBUS_ITEM_CREDS, + KDBUS_ITEM_PIDS, + KDBUS_ITEM_AUXGROUPS, + KDBUS_ITEM_OWNED_NAME, + KDBUS_ITEM_TID_COMM, + KDBUS_ITEM_PID_COMM, + KDBUS_ITEM_EXE, + KDBUS_ITEM_CMDLINE, + KDBUS_ITEM_CGROUP, + KDBUS_ITEM_CAPS, + KDBUS_ITEM_SECLABEL, + KDBUS_ITEM_AUDIT, + KDBUS_ITEM_CONN_DESCRIPTION, + + _KDBUS_ITEM_POLICY_BASE = 0x2000, + KDBUS_ITEM_POLICY_ACCESS = _KDBUS_ITEM_POLICY_BASE, + + _KDBUS_ITEM_KERNEL_BASE = 0x8000, + KDBUS_ITEM_NAME_ADD = _KDBUS_ITEM_KERNEL_BASE, + KDBUS_ITEM_NAME_REMOVE, + KDBUS_ITEM_NAME_CHANGE, + KDBUS_ITEM_ID_ADD, + KDBUS_ITEM_ID_REMOVE, + KDBUS_ITEM_REPLY_TIMEOUT, + KDBUS_ITEM_REPLY_DEAD, +}; + +/** + * struct kdbus_item - chain of data blocks + * @size: Overall data record size + * @type: Kdbus_item type of data + * @data: Generic bytes + * @data32: Generic 32 bit array + * @data64: Generic 64 bit array + * @str: Generic string + * @id: Connection ID + * @vec: KDBUS_ITEM_PAYLOAD_VEC + * @creds: KDBUS_ITEM_CREDS + * @audit: KDBUS_ITEM_AUDIT + * @timestamp: KDBUS_ITEM_TIMESTAMP + * @name: KDBUS_ITEM_NAME + * @bloom_parameter: KDBUS_ITEM_BLOOM_PARAMETER + * @bloom_filter: KDBUS_ITEM_BLOOM_FILTER + * @memfd: KDBUS_ITEM_PAYLOAD_MEMFD + * @name_change: KDBUS_ITEM_NAME_ADD + * KDBUS_ITEM_NAME_REMOVE + * KDBUS_ITEM_NAME_CHANGE + * 
@id_change: KDBUS_ITEM_ID_ADD + * KDBUS_ITEM_ID_REMOVE + * @policy: KDBUS_ITEM_POLICY_ACCESS + */ +struct kdbus_item { + __u64 size; + __u64 type; + union { + __u8 data[0]; + __u32 data32[0]; + __u64 data64[0]; + char str[0]; + + __u64 id; + struct kdbus_vec vec; + struct kdbus_creds creds; + struct kdbus_pids pids; + struct kdbus_audit audit; + struct kdbus_caps caps; + struct kdbus_timestamp timestamp; + struct kdbus_name name; + struct kdbus_bloom_parameter bloom_parameter; + struct kdbus_bloom_filter bloom_filter; + struct kdbus_memfd memfd; + int fds[0]; + struct kdbus_notify_name_change name_change; + struct kdbus_notify_id_change id_change; + struct kdbus_policy_access policy_access; + }; +} __attribute__((__aligned__(8))); + +/** + * enum kdbus_msg_flags - type of message + * @KDBUS_MSG_EXPECT_REPLY: Expect a reply message, used for + * method calls. The userspace-supplied + * cookie identifies the message and the + * respective reply carries the cookie + * in cookie_reply + * @KDBUS_MSG_NO_AUTO_START: Do not start a service if the addressed + * name is not currently active. This flag is + * not looked at by the kernel but only + * serves as hint for userspace implementations. + * @KDBUS_MSG_SIGNAL: Treat this message as signal + */ +enum kdbus_msg_flags { + KDBUS_MSG_EXPECT_REPLY = 1ULL << 0, + KDBUS_MSG_NO_AUTO_START = 1ULL << 1, + KDBUS_MSG_SIGNAL = 1ULL << 2, +}; + +/** + * enum kdbus_payload_type - type of payload carried by message + * @KDBUS_PAYLOAD_KERNEL: Kernel-generated simple message + * @KDBUS_PAYLOAD_DBUS: D-Bus marshalling "DBusDBus" + * + * Any payload-type is accepted. Common types will get added here once + * established. + */ +enum kdbus_payload_type { + KDBUS_PAYLOAD_KERNEL, + KDBUS_PAYLOAD_DBUS = 0x4442757344427573ULL, +}; + +/** + * struct kdbus_msg - the representation of a kdbus message + * @size: Total size of the message + * @flags: Message flags (KDBUS_MSG_*), userspace → kernel + * @priority: Message queue priority value + * @dst_id: 64-bit ID of the destination connection + * @src_id: 64-bit ID of the source connection + * @payload_type: Payload type (KDBUS_PAYLOAD_*) + * @cookie: Userspace-supplied cookie, for the connection + * to identify its messages + * @timeout_ns: The time to wait for a message reply from the peer. + * If there is no reply, and the send command is + * executed asynchronously, a kernel-generated message + * with an attached KDBUS_ITEM_REPLY_TIMEOUT item + * is sent to @src_id. For synchronously executed send + * command, the value denotes the maximum time the call + * blocks to wait for a reply. The timeout is expected in + * nanoseconds and as absolute CLOCK_MONOTONIC value. + * @cookie_reply: A reply to the requesting message with the same + * cookie. 
The requesting connection can match its + * request and the reply with this value + * @items: A list of kdbus_items containing the message payload + */ +struct kdbus_msg { + __u64 size; + __u64 flags; + __s64 priority; + __u64 dst_id; + __u64 src_id; + __u64 payload_type; + __u64 cookie; + union { + __u64 timeout_ns; + __u64 cookie_reply; + }; + struct kdbus_item items[0]; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_msg_info - returned message container + * @offset: Offset of kdbus_msg slice in pool + * @msg_size: Copy of the kdbus_msg.size field + * @return_flags: Command return flags, kernel → userspace + */ +struct kdbus_msg_info { + __u64 offset; + __u64 msg_size; + __u64 return_flags; +} __attribute__((__aligned__(8))); + +/** + * enum kdbus_send_flags - flags for sending messages + * @KDBUS_SEND_SYNC_REPLY: Wait for destination connection to + * reply to this message. The + * KDBUS_CMD_SEND ioctl() will block + * until the reply is received, and + * reply in struct kdbus_cmd_send will + * yield the offset in the sender's pool + * where the reply can be found. + * This flag is only valid if + * @KDBUS_MSG_EXPECT_REPLY is set as well. + */ +enum kdbus_send_flags { + KDBUS_SEND_SYNC_REPLY = 1ULL << 0, +}; + +/** + * struct kdbus_cmd_send - send message + * @size: Overall size of this structure + * @flags: Flags to change send behavior (KDBUS_SEND_*) + * @return_flags: Command return flags, kernel → userspace + * @msg_address: Storage address of the kdbus_msg to send + * @reply: Storage for message reply if KDBUS_SEND_SYNC_REPLY + * was given + * @items: Additional items for this command + */ +struct kdbus_cmd_send { + __u64 size; + __u64 flags; + __u64 return_flags; + __u64 msg_address; + struct kdbus_msg_info reply; + struct kdbus_item items[0]; +} __attribute__((__aligned__(8))); + +/** + * enum kdbus_recv_flags - flags for de-queuing messages + * @KDBUS_RECV_PEEK: Return the next queued message without + * actually de-queuing it, and without installing + * any file descriptors or other resources. It is + * usually used to determine the activating + * connection of a bus name. + * @KDBUS_RECV_DROP: Drop and free the next queued message and all + * its resources without actually receiving it. + * @KDBUS_RECV_USE_PRIORITY: Only de-queue messages with the specified or + * higher priority (lowest values); if not set, + * the priority value is ignored. + */ +enum kdbus_recv_flags { + KDBUS_RECV_PEEK = 1ULL << 0, + KDBUS_RECV_DROP = 1ULL << 1, + KDBUS_RECV_USE_PRIORITY = 1ULL << 2, +}; + +/** + * enum kdbus_recv_return_flags - return flags for message receive commands + * @KDBUS_RECV_RETURN_INCOMPLETE_FDS: One or more file descriptors could not + * be installed. These descriptors in + * KDBUS_ITEM_FDS will carry the value -1. + * @KDBUS_RECV_RETURN_DROPPED_MSGS: There have been dropped messages since + * the last time a message was received. + * The 'dropped_msgs' counter contains the + * number of messages dropped pool + * overflows or other missed broadcasts. + */ +enum kdbus_recv_return_flags { + KDBUS_RECV_RETURN_INCOMPLETE_FDS = 1ULL << 0, + KDBUS_RECV_RETURN_DROPPED_MSGS = 1ULL << 1, +}; + +/** + * struct kdbus_cmd_recv - struct to de-queue a buffered message + * @size: Overall size of this object + * @flags: KDBUS_RECV_* flags, userspace → kernel + * @return_flags: Command return flags, kernel → userspace + * @priority: Minimum priority of the messages to de-queue. Lowest + * values have the highest priority. 
+ * @dropped_msgs: In case there were any dropped messages since the last
+ * time a message was received, this will be set to the
+ * number of lost messages and
+ * KDBUS_RECV_RETURN_DROPPED_MSGS will be set in
+ * 'return_flags'. This can only happen if the ioctl
+ * returns 0 or EAGAIN.
+ * @msg: Return storage for received message.
+ * @items: Additional items for this command.
+ *
+ * This struct is used with the KDBUS_CMD_RECV ioctl.
+ */
+struct kdbus_cmd_recv {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ __s64 priority;
+ __u64 dropped_msgs;
+ struct kdbus_msg_info msg;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * struct kdbus_cmd_free - struct to free a slice of memory in the pool
+ * @size: Overall size of this structure
+ * @flags: Flags for the free command, userspace → kernel
+ * @return_flags: Command return flags, kernel → userspace
+ * @offset: The offset of the memory slice, as returned by other
+ * ioctls
+ * @items: Additional items to modify the behavior
+ *
+ * This struct is used with the KDBUS_CMD_FREE ioctl.
+ */
+struct kdbus_cmd_free {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ __u64 offset;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * enum kdbus_hello_flags - flags for struct kdbus_cmd_hello
+ * @KDBUS_HELLO_ACCEPT_FD: The connection allows the reception of
+ * any passed file descriptors
+ * @KDBUS_HELLO_ACTIVATOR: Special-purpose connection which registers
+ * a well-known name for a process to be started
+ * when traffic arrives
+ * @KDBUS_HELLO_POLICY_HOLDER: Special-purpose connection which registers
+ * policy entries for a name. The provided name
+ * is not activated and not registered with the
+ * name database; it only allows unprivileged
+ * connections to acquire a name, talk or discover
+ * a service
+ * @KDBUS_HELLO_MONITOR: Special-purpose connection to monitor
+ * bus traffic
+ */
+enum kdbus_hello_flags {
+ KDBUS_HELLO_ACCEPT_FD = 1ULL << 0,
+ KDBUS_HELLO_ACTIVATOR = 1ULL << 1,
+ KDBUS_HELLO_POLICY_HOLDER = 1ULL << 2,
+ KDBUS_HELLO_MONITOR = 1ULL << 3,
+};
+
+/**
+ * struct kdbus_cmd_hello - struct to say hello to kdbus
+ * @size: The total size of the structure
+ * @flags: Connection flags (KDBUS_HELLO_*), userspace → kernel
+ * @return_flags: Command return flags, kernel → userspace
+ * @attach_flags_send: Mask of metadata to attach to each message sent
+ * off by this connection (KDBUS_ATTACH_*)
+ * @attach_flags_recv: Mask of metadata to attach to each message received
+ * by the new connection (KDBUS_ATTACH_*)
+ * @bus_flags: The flags field copied verbatim from the original
+ * KDBUS_CMD_BUS_MAKE ioctl. It's intended to be useful
+ * to do negotiation of features of the payload that is
+ * transferred (kernel → userspace)
+ * @id: The ID of this connection (kernel → userspace)
+ * @pool_size: Size of the connection's buffer where the received
+ * messages are placed
+ * @offset: Pool offset where items are returned to report
+ * additional information about the bus and the newly
+ * created connection.
+ * @items_size: Size of buffer returned in the pool slice at @offset.
+ * @id128: Unique 128-bit ID of the bus (kernel → userspace)
+ * @items: A list of items
+ *
+ * This struct is used with the KDBUS_CMD_HELLO ioctl.
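For illustration, a minimal HELLO from userspace could look like the sketch below. It is not taken from the patch itself: the helper name and the pool size are assumptions, <linux/kdbus.h> is assumed to be installed from this patch, and ep_fd is expected to be a file descriptor obtained by opening a bus endpoint node.

#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kdbus.h>

/* Illustrative sketch: register a connection on an already opened bus
 * endpoint node and hand back the connection ID assigned by the kernel. */
static int kdbus_hello_example(int ep_fd, uint64_t *id)
{
	struct kdbus_cmd_hello hello = {
		.size = sizeof(hello),
		.flags = KDBUS_HELLO_ACCEPT_FD,	/* accept passed FDs */
		.pool_size = 1024 * 1024,	/* must be page-aligned */
	};

	if (ioctl(ep_fd, KDBUS_CMD_HELLO, &hello) < 0)
		return -errno;

	*id = hello.id;	/* unique ID of this connection on the bus */
	return 0;
}

The items returned at hello.offset (for example the bus-wide bloom parameters) live in the connection's pool, which would typically be mapped from the same file descriptor to read them and any queued messages.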
+ */ +struct kdbus_cmd_hello { + __u64 size; + __u64 flags; + __u64 return_flags; + __u64 attach_flags_send; + __u64 attach_flags_recv; + __u64 bus_flags; + __u64 id; + __u64 pool_size; + __u64 offset; + __u64 items_size; + __u8 id128[16]; + struct kdbus_item items[0]; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_info - connection information + * @size: total size of the struct + * @id: 64bit object ID + * @flags: object creation flags + * @items: list of items + * + * Note that the user is responsible for freeing the allocated memory with + * the KDBUS_CMD_FREE ioctl. + */ +struct kdbus_info { + __u64 size; + __u64 id; + __u64 flags; + struct kdbus_item items[0]; +} __attribute__((__aligned__(8))); + +/** + * enum kdbus_list_flags - what to include into the returned list + * @KDBUS_LIST_UNIQUE: active connections + * @KDBUS_LIST_ACTIVATORS: activator connections + * @KDBUS_LIST_NAMES: known well-known names + * @KDBUS_LIST_QUEUED: queued-up names + */ +enum kdbus_list_flags { + KDBUS_LIST_UNIQUE = 1ULL << 0, + KDBUS_LIST_NAMES = 1ULL << 1, + KDBUS_LIST_ACTIVATORS = 1ULL << 2, + KDBUS_LIST_QUEUED = 1ULL << 3, +}; + +/** + * struct kdbus_cmd_list - list connections + * @size: overall size of this object + * @flags: flags for the query (KDBUS_LIST_*), userspace → kernel + * @return_flags: command return flags, kernel → userspace + * @offset: Offset in the caller's pool buffer where an array of + * kdbus_info objects is stored. + * The user must use KDBUS_CMD_FREE to free the + * allocated memory. + * @list_size: size of returned list in bytes + * @items: Items for the command. Reserved for future use. + * + * This structure is used with the KDBUS_CMD_LIST ioctl. + */ +struct kdbus_cmd_list { + __u64 size; + __u64 flags; + __u64 return_flags; + __u64 offset; + __u64 list_size; + struct kdbus_item items[0]; +} __attribute__((__aligned__(8))); + +/** + * struct kdbus_cmd_info - struct used for KDBUS_CMD_CONN_INFO ioctl + * @size: The total size of the struct + * @flags: Flags for this ioctl, userspace → kernel + * @return_flags: Command return flags, kernel → userspace + * @id: The 64-bit ID of the connection. If set to zero, passing + * @name is required. kdbus will look up the name to + * determine the ID in this case. + * @attach_flags: Set of attach flags to specify the set of information + * to receive, userspace → kernel + * @offset: Returned offset in the caller's pool buffer where the + * kdbus_info struct result is stored. The user must + * use KDBUS_CMD_FREE to free the allocated memory. + * @info_size: Output buffer to report size of data at @offset. + * @items: The optional item list, containing the + * well-known name to look up as a KDBUS_ITEM_NAME. + * Only needed in case @id is zero. + * + * On success, the KDBUS_CMD_CONN_INFO ioctl will return 0 and @offset will + * tell the user the offset in the connection pool buffer at which to find the + * result in a struct kdbus_info. + */ +struct kdbus_cmd_info { + __u64 size; + __u64 flags; + __u64 return_flags; + __u64 id; + __u64 attach_flags; + __u64 offset; + __u64 info_size; + struct kdbus_item items[0]; +} __attribute__((__aligned__(8))); + +/** + * enum kdbus_cmd_match_flags - flags to control the KDBUS_CMD_MATCH_ADD ioctl + * @KDBUS_MATCH_REPLACE: If entries with the supplied cookie already + * exists, remove them before installing the new + * matches. 
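To make the REPLACE semantics concrete, here is a hedged sketch that installs a match for broadcast messages coming from one specific sender, reusing a cookie so that previously installed matches with the same cookie are dropped. The function name is made up, and treating a KDBUS_ITEM_ID item inside a match as "match on the sending connection's ID" is an assumption made for the example; struct kdbus_cmd_match and KDBUS_CMD_MATCH_ADD are defined below.

#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kdbus.h>

/* Illustrative sketch: match broadcasts from the connection with ID
 * "src_id", replacing older matches that used the same cookie. */
static int kdbus_match_example(int conn_fd, uint64_t cookie, uint64_t src_id)
{
	struct {
		struct kdbus_cmd_match cmd;
		struct kdbus_item item;
	} m;

	memset(&m, 0, sizeof(m));

	m.item.type = KDBUS_ITEM_ID;
	m.item.size = offsetof(struct kdbus_item, id) + sizeof(m.item.id);
	m.item.id = src_id;

	m.cmd.flags = KDBUS_MATCH_REPLACE;	/* drop old matches with this cookie */
	m.cmd.cookie = cookie;
	/* overall size covers the header plus the single item actually used */
	m.cmd.size = sizeof(m.cmd) + m.item.size;

	if (ioctl(conn_fd, KDBUS_CMD_MATCH_ADD, &m) < 0)
		return -errno;
	return 0;
}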
+ */
+enum kdbus_cmd_match_flags {
+ KDBUS_MATCH_REPLACE = 1ULL << 0,
+};
+
+/**
+ * struct kdbus_cmd_match - struct to add or remove matches
+ * @size: The total size of the struct
+ * @flags: Flags for match command (KDBUS_MATCH_*),
+ * userspace → kernel
+ * @return_flags: Command return flags, kernel → userspace
+ * @cookie: Userspace supplied cookie. When removing, the cookie
+ * identifies the match to remove
+ * @items: A list of items for additional information
+ *
+ * This structure is used with the KDBUS_CMD_MATCH_ADD and
+ * KDBUS_CMD_MATCH_REMOVE ioctl.
+ */
+struct kdbus_cmd_match {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ __u64 cookie;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * enum kdbus_make_flags - Flags for KDBUS_CMD_{BUS,ENDPOINT}_MAKE
+ * @KDBUS_MAKE_ACCESS_GROUP: Make the bus or endpoint node group-accessible
+ * @KDBUS_MAKE_ACCESS_WORLD: Make the bus or endpoint node world-accessible
+ */
+enum kdbus_make_flags {
+ KDBUS_MAKE_ACCESS_GROUP = 1ULL << 0,
+ KDBUS_MAKE_ACCESS_WORLD = 1ULL << 1,
+};
+
+/**
+ * enum kdbus_name_flags - flags for KDBUS_CMD_NAME_ACQUIRE
+ * @KDBUS_NAME_REPLACE_EXISTING: Try to replace name of other connections
+ * @KDBUS_NAME_ALLOW_REPLACEMENT: Allow the replacement of the name
+ * @KDBUS_NAME_QUEUE: Name should be queued if busy
+ * @KDBUS_NAME_IN_QUEUE: Name is queued
+ * @KDBUS_NAME_ACTIVATOR: Name is owned by an activator connection
+ * @KDBUS_NAME_PRIMARY: Primary owner of the name
+ * @KDBUS_NAME_ACQUIRED: Name was acquired/queued _now_
+ */
+enum kdbus_name_flags {
+ KDBUS_NAME_REPLACE_EXISTING = 1ULL << 0,
+ KDBUS_NAME_ALLOW_REPLACEMENT = 1ULL << 1,
+ KDBUS_NAME_QUEUE = 1ULL << 2,
+ KDBUS_NAME_IN_QUEUE = 1ULL << 3,
+ KDBUS_NAME_ACTIVATOR = 1ULL << 4,
+ KDBUS_NAME_PRIMARY = 1ULL << 5,
+ KDBUS_NAME_ACQUIRED = 1ULL << 6,
+};
+
+/**
+ * struct kdbus_cmd - generic ioctl payload
+ * @size: Overall size of this structure
+ * @flags: Flags for this ioctl, userspace → kernel
+ * @return_flags: Ioctl return flags, kernel → userspace
+ * @items: Additional items to modify the behavior
+ *
+ * This is a generic ioctl payload object. It's used by all ioctls that only
+ * take flags and items as input.
+ */
+struct kdbus_cmd {
+ __u64 size;
+ __u64 flags;
+ __u64 return_flags;
+ struct kdbus_item items[0];
+} __attribute__((__aligned__(8)));
+
+/**
+ * Ioctl API
+ *
+ * KDBUS_CMD_BUS_MAKE: After opening the "control" node, this command
+ * creates a new bus with the specified
+ * name. The bus is immediately shut down and
+ * cleaned up when the opened file descriptor is
+ * closed.
+ *
+ * KDBUS_CMD_ENDPOINT_MAKE: Creates a new named special endpoint to talk to
+ * the bus. Such endpoints usually carry a more
+ * restrictive policy and grant restricted access
+ * to specific applications.
+ * KDBUS_CMD_ENDPOINT_UPDATE: Update the properties of a custom endpoint. Used
+ * to update the policy.
+ *
+ * KDBUS_CMD_HELLO: By opening the bus node, a connection is
+ * created. After a HELLO the opened connection
+ * becomes an active peer on the bus.
+ * KDBUS_CMD_UPDATE: Update the properties of a connection. Used to
+ * update the metadata subscription mask and
+ * policy.
+ * KDBUS_CMD_BYEBYE: Disconnect a connection. If there are no
+ * messages queued up in the connection's pool,
+ * the call succeeds, and the handle is rendered
+ * unusable. Otherwise, -EBUSY is returned without
+ * any further side-effects.
+ * KDBUS_CMD_FREE: Release the allocated memory in the receiver's
+ * pool.
+ * KDBUS_CMD_CONN_INFO: Retrieve credentials and properties of the + * initial creator of the connection. The data was + * stored at registration time and does not + * necessarily represent the connected process or + * the actual state of the process. + * KDBUS_CMD_BUS_CREATOR_INFO: Retrieve information of the creator of the bus + * a connection is attached to. + * + * KDBUS_CMD_SEND: Send a message and pass data from userspace to + * the kernel. + * KDBUS_CMD_RECV: Receive a message from the kernel which is + * placed in the receiver's pool. + * + * KDBUS_CMD_NAME_ACQUIRE: Request a well-known bus name to associate with + * the connection. Well-known names are used to + * address a peer on the bus. + * KDBUS_CMD_NAME_RELEASE: Release a well-known name the connection + * currently owns. + * KDBUS_CMD_LIST: Retrieve the list of all currently registered + * well-known and unique names. + * + * KDBUS_CMD_MATCH_ADD: Install a match which broadcast messages should + * be delivered to the connection. + * KDBUS_CMD_MATCH_REMOVE: Remove a current match for broadcast messages. + */ +enum kdbus_ioctl_type { + /* bus owner (00-0f) */ + KDBUS_CMD_BUS_MAKE = _IOW(KDBUS_IOCTL_MAGIC, 0x00, + struct kdbus_cmd), + + /* endpoint owner (10-1f) */ + KDBUS_CMD_ENDPOINT_MAKE = _IOW(KDBUS_IOCTL_MAGIC, 0x10, + struct kdbus_cmd), + KDBUS_CMD_ENDPOINT_UPDATE = _IOW(KDBUS_IOCTL_MAGIC, 0x11, + struct kdbus_cmd), + + /* connection owner (80-ff) */ + KDBUS_CMD_HELLO = _IOWR(KDBUS_IOCTL_MAGIC, 0x80, + struct kdbus_cmd_hello), + KDBUS_CMD_UPDATE = _IOW(KDBUS_IOCTL_MAGIC, 0x81, + struct kdbus_cmd), + KDBUS_CMD_BYEBYE = _IOW(KDBUS_IOCTL_MAGIC, 0x82, + struct kdbus_cmd), + KDBUS_CMD_FREE = _IOW(KDBUS_IOCTL_MAGIC, 0x83, + struct kdbus_cmd_free), + KDBUS_CMD_CONN_INFO = _IOR(KDBUS_IOCTL_MAGIC, 0x84, + struct kdbus_cmd_info), + KDBUS_CMD_BUS_CREATOR_INFO = _IOR(KDBUS_IOCTL_MAGIC, 0x85, + struct kdbus_cmd_info), + KDBUS_CMD_LIST = _IOR(KDBUS_IOCTL_MAGIC, 0x86, + struct kdbus_cmd_list), + + KDBUS_CMD_SEND = _IOW(KDBUS_IOCTL_MAGIC, 0x90, + struct kdbus_cmd_send), + KDBUS_CMD_RECV = _IOR(KDBUS_IOCTL_MAGIC, 0x91, + struct kdbus_cmd_recv), + + KDBUS_CMD_NAME_ACQUIRE = _IOW(KDBUS_IOCTL_MAGIC, 0xa0, + struct kdbus_cmd), + KDBUS_CMD_NAME_RELEASE = _IOW(KDBUS_IOCTL_MAGIC, 0xa1, + struct kdbus_cmd), + + KDBUS_CMD_MATCH_ADD = _IOW(KDBUS_IOCTL_MAGIC, 0xb0, + struct kdbus_cmd_match), + KDBUS_CMD_MATCH_REMOVE = _IOW(KDBUS_IOCTL_MAGIC, 0xb1, + struct kdbus_cmd_match), +}; + +#endif /* _UAPI_KDBUS_H_ */ diff -Nur linux-4.2/include/uapi/linux/magic.h linux-4.2-pck/include/uapi/linux/magic.h --- linux-4.2/include/uapi/linux/magic.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/uapi/linux/magic.h 2015-09-08 00:48:22.231697056 -0300 @@ -76,4 +76,6 @@ #define BTRFS_TEST_MAGIC 0x73727279 #define NSFS_MAGIC 0x6e736673 +#define KDBUS_SUPER_MAGIC 0x44427573 + #endif /* __LINUX_MAGIC_H__ */ diff -Nur linux-4.2/include/uapi/linux/netlink.h linux-4.2-pck/include/uapi/linux/netlink.h --- linux-4.2/include/uapi/linux/netlink.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/uapi/linux/netlink.h 2015-09-08 11:20:32.353684177 -0300 @@ -27,6 +27,8 @@ #define NETLINK_ECRYPTFS 19 #define NETLINK_RDMA 20 #define NETLINK_CRYPTO 21 /* Crypto layer */ +#define NETLINK_TOI_USERUI 22 /* TuxOnIce's userui */ +#define NETLINK_TOI_USM 23 /* Userspace storage manager */ #define NETLINK_INET_DIAG NETLINK_SOCK_DIAG diff -Nur linux-4.2/include/uapi/linux/tcp.h linux-4.2-pck/include/uapi/linux/tcp.h --- linux-4.2/include/uapi/linux/tcp.h 
2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/uapi/linux/tcp.h 2015-09-08 00:48:31.501257347 -0300 @@ -115,6 +115,9 @@ #define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ #define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */ #define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ +#define TCP_STEALTH 29 +#define TCP_STEALTH_INTEGRITY 30 +#define TCP_STEALTH_INTEGRITY_LEN 31 struct tcp_repair_opt { __u32 opt_code; diff -Nur linux-4.2/include/uapi/linux/vt.h linux-4.2-pck/include/uapi/linux/vt.h --- linux-4.2/include/uapi/linux/vt.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/include/uapi/linux/vt.h 2015-09-08 00:48:22.231697056 -0300 @@ -3,12 +3,26 @@ /* + * We will make this definition solely for the purpose of making packages + * such as splashutils build, because they can not understand that + * NR_TTY_DEVICES is defined in the kernel configuration. + */ +#ifndef CONFIG_NR_TTY_DEVICES +#define CONFIG_NR_TTY_DEVICES 63 +#endif + +/* * These constants are also useful for user-level apps (e.g., VC * resizing). */ #define MIN_NR_CONSOLES 1 /* must be at least 1 */ -#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ -#define MAX_NR_USER_CONSOLES 63 /* must be root to allocate above this */ + +/* + * NR_TTY_DEVICES: + * Value MUST be at least 11 and must never be higher then 63 + */ +#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ +#define MAX_NR_USER_CONSOLES CONFIG_NR_TTY_DEVICES /* must be root to allocate above this */ /* Note: the ioctl VT_GETSTATE does not work for consoles 16 and higher (since it returns a short) */ diff -Nur linux-4.2/init/Kconfig linux-4.2-pck/init/Kconfig --- linux-4.2/init/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/init/Kconfig 2015-09-08 00:48:22.231697056 -0300 @@ -28,6 +28,26 @@ menu "General setup" +config PCK_INTERACTIVE + bool "Tune kernel for interactivity" + default y + help + Tunes the kernel for responsiveness at the cost of throughput and power usage. + + --- VM --- + Mem dirty before bg writeback..: 10 % -> 128 MiB + Mem dirty before sync writeback: 20 % -> 256 MiB + + --- CPU Scheduler --- + Scheduling latency.............: 6 -> 3 ms + Minimal granularity............: 0.75 -> 0.3 ms + Wakeup granularity.............: 1 -> 0.5 ms + CPU migration cost.............: 0.5 -> 0.25 ms + Bandwidth slice size...........: 5 -> 3 ms + + --- CPU Frequency Scaling --- + Ondemand down scaling factor...: 1 -> 10 + config BROKEN bool @@ -261,6 +281,19 @@ depends on SYSCTL default y +config KDBUS + tristate "kdbus interprocess communication" + depends on TMPFS + help + D-Bus is a system for low-latency, low-overhead, easy to use + interprocess communication (IPC). + + See the man-pages and HTML files in Documentation/kdbus/ + that are generated by 'make mandocs' and 'make htmldocs'. + + If you have an ordinary machine, select M here. The module + will be called kdbus. 
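As a quick illustration of how the ioctl API declared in the uapi header above is meant to be driven from userspace, the sketch below creates a bus through the control node. The kdbusfs mount point /sys/fs/kdbus, the bloom parameter values and the bus name suffix are assumptions made for the example; the name has to start with the caller's effective UID, and closing the returned control descriptor destroys the bus again.

#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kdbus.h>

/* Illustrative sketch: create a bus called "<euid>-example" through the
 * control node and return the control fd that keeps the bus alive. */
static int kdbus_bus_make_example(void)
{
	struct {
		struct kdbus_cmd cmd;
		uint8_t space[256];
	} __attribute__((aligned(8))) buf;
	struct kdbus_item *item;
	char name[64];
	int fd;

	memset(&buf, 0, sizeof(buf));
	snprintf(name, sizeof(name), "%u-example", (unsigned int)geteuid());

	/* first item: the bus name, prefixed with the creator's UID */
	item = buf.cmd.items;
	item->type = KDBUS_ITEM_MAKE_NAME;
	item->size = offsetof(struct kdbus_item, str) + strlen(name) + 1;
	strcpy(item->str, name);

	/* second item: the mandatory bloom parameters, at an 8-byte boundary */
	item = (struct kdbus_item *)((uint8_t *)item + ((item->size + 7) & ~7ULL));
	item->type = KDBUS_ITEM_BLOOM_PARAMETER;
	item->size = offsetof(struct kdbus_item, bloom_parameter) +
		     sizeof(item->bloom_parameter);
	item->bloom_parameter.size = 64;	/* must be a multiple of 8 */
	item->bloom_parameter.n_hash = 1;

	buf.cmd.size = (uint8_t *)item + item->size - (uint8_t *)&buf;

	/* the mount point of kdbusfs is an assumption here */
	fd = open("/sys/fs/kdbus/control", O_RDWR | O_CLOEXEC);
	if (fd < 0)
		return -errno;

	if (ioctl(fd, KDBUS_CMD_BUS_MAKE, &buf) < 0) {
		int ret = -errno;
		close(fd);
		return ret;
	}

	return fd;	/* closing this fd tears the bus down again */
}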
+ config CROSS_MEMORY_ATTACH bool "Enable process_vm_readv/writev syscalls" depends on MMU @@ -1137,7 +1170,7 @@ endif # CGROUPS config CHECKPOINT_RESTORE - bool "Checkpoint/restore support" if EXPERT + bool "Checkpoint/restore support" select PROC_CHILDREN default n help diff -Nur linux-4.2/init/do_mounts.c linux-4.2-pck/init/do_mounts.c --- linux-4.2/init/do_mounts.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/init/do_mounts.c 2015-09-08 11:20:32.383682710 -0300 @@ -597,6 +597,8 @@ if (is_floppy && rd_doload && rd_load_disk(0)) ROOT_DEV = Root_RAM0; + check_resume_attempted(); + mount_root(); out: devtmpfs_mount("dev"); diff -Nur linux-4.2/init/do_mounts_initrd.c linux-4.2-pck/init/do_mounts_initrd.c --- linux-4.2/init/do_mounts_initrd.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/init/do_mounts_initrd.c 2015-09-08 11:20:32.393682222 -0300 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -79,6 +80,11 @@ current->flags &= ~PF_FREEZER_SKIP; + if (!resume_attempted) + printk(KERN_ERR "TuxOnIce: No attempt was made to resume from " + "any image that might exist.\n"); + clear_toi_state(TOI_BOOT_TIME); + /* move initrd to rootfs' /old */ sys_mount("..", ".", NULL, MS_MOVE, NULL); /* switch root and cwd back to / of rootfs */ diff -Nur linux-4.2/ipc/Makefile linux-4.2-pck/ipc/Makefile --- linux-4.2/ipc/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/ipc/Makefile 2015-09-08 00:48:22.231697056 -0300 @@ -9,4 +9,4 @@ obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) obj-$(CONFIG_IPC_NS) += namespace.o obj-$(CONFIG_POSIX_MQUEUE_SYSCTL) += mq_sysctl.o - +obj-$(CONFIG_KDBUS) += kdbus/ diff -Nur linux-4.2/ipc/kdbus/Makefile linux-4.2-pck/ipc/kdbus/Makefile --- linux-4.2/ipc/kdbus/Makefile 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/Makefile 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,33 @@ +# +# By setting KDBUS_EXT=2, the kdbus module will be built as kdbus2.ko, and +# KBUILD_MODNAME=kdbus2. This has the effect that all exported objects have +# different names than usually (kdbus2fs, /sys/fs/kdbus2/) and you can run +# your test-infrastructure against the kdbus2.ko, while running your system +# on kdbus.ko. +# +# To just build the module, use: +# make KDBUS_EXT=2 M=ipc/kdbus +# + +kdbus$(KDBUS_EXT)-y := \ + bus.o \ + connection.o \ + endpoint.o \ + fs.o \ + handle.o \ + item.o \ + main.o \ + match.o \ + message.o \ + metadata.o \ + names.o \ + node.o \ + notify.o \ + domain.o \ + policy.o \ + pool.o \ + reply.o \ + queue.o \ + util.o + +obj-$(CONFIG_KDBUS) += kdbus$(KDBUS_EXT).o diff -Nur linux-4.2/ipc/kdbus/bus.c linux-4.2-pck/ipc/kdbus/bus.c --- linux-4.2/ipc/kdbus/bus.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/bus.c 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,514 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "notify.h" +#include "connection.h" +#include "domain.h" +#include "endpoint.h" +#include "handle.h" +#include "item.h" +#include "match.h" +#include "message.h" +#include "metadata.h" +#include "names.h" +#include "policy.h" +#include "util.h" + +static void kdbus_bus_free(struct kdbus_node *node) +{ + struct kdbus_bus *bus = container_of(node, struct kdbus_bus, node); + + WARN_ON(!list_empty(&bus->monitors_list)); + WARN_ON(!hash_empty(bus->conn_hash)); + + kdbus_notify_free(bus); + + kdbus_user_unref(bus->creator); + kdbus_name_registry_free(bus->name_registry); + kdbus_domain_unref(bus->domain); + kdbus_policy_db_clear(&bus->policy_db); + kdbus_meta_proc_unref(bus->creator_meta); + kfree(bus); +} + +static void kdbus_bus_release(struct kdbus_node *node, bool was_active) +{ + struct kdbus_bus *bus = container_of(node, struct kdbus_bus, node); + + if (was_active) + atomic_dec(&bus->creator->buses); +} + +static struct kdbus_bus *kdbus_bus_new(struct kdbus_domain *domain, + const char *name, + struct kdbus_bloom_parameter *bloom, + const u64 *pattach_owner, + u64 flags, kuid_t uid, kgid_t gid) +{ + struct kdbus_bus *b; + u64 attach_owner; + int ret; + + if (bloom->size < 8 || bloom->size > KDBUS_BUS_BLOOM_MAX_SIZE || + !KDBUS_IS_ALIGNED8(bloom->size) || bloom->n_hash < 1) + return ERR_PTR(-EINVAL); + + ret = kdbus_sanitize_attach_flags(pattach_owner ? *pattach_owner : 0, + &attach_owner); + if (ret < 0) + return ERR_PTR(ret); + + ret = kdbus_verify_uid_prefix(name, domain->user_namespace, uid); + if (ret < 0) + return ERR_PTR(ret); + + b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return ERR_PTR(-ENOMEM); + + kdbus_node_init(&b->node, KDBUS_NODE_BUS); + + b->node.free_cb = kdbus_bus_free; + b->node.release_cb = kdbus_bus_release; + b->node.uid = uid; + b->node.gid = gid; + b->node.mode = S_IRUSR | S_IXUSR; + + if (flags & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD)) + b->node.mode |= S_IRGRP | S_IXGRP; + if (flags & KDBUS_MAKE_ACCESS_WORLD) + b->node.mode |= S_IROTH | S_IXOTH; + + b->id = atomic64_inc_return(&domain->last_id); + b->bus_flags = flags; + b->attach_flags_owner = attach_owner; + generate_random_uuid(b->id128); + b->bloom = *bloom; + b->domain = kdbus_domain_ref(domain); + + kdbus_policy_db_init(&b->policy_db); + + init_rwsem(&b->conn_rwlock); + hash_init(b->conn_hash); + INIT_LIST_HEAD(&b->monitors_list); + + INIT_LIST_HEAD(&b->notify_list); + spin_lock_init(&b->notify_lock); + mutex_init(&b->notify_flush_lock); + + ret = kdbus_node_link(&b->node, &domain->node, name); + if (ret < 0) + goto exit_unref; + + /* cache the metadata/credentials of the creator */ + b->creator_meta = kdbus_meta_proc_new(); + if (IS_ERR(b->creator_meta)) { + ret = PTR_ERR(b->creator_meta); + b->creator_meta = NULL; + goto exit_unref; + } + + ret = kdbus_meta_proc_collect(b->creator_meta, + KDBUS_ATTACH_CREDS | + KDBUS_ATTACH_PIDS | + KDBUS_ATTACH_AUXGROUPS | + KDBUS_ATTACH_TID_COMM | + KDBUS_ATTACH_PID_COMM | + KDBUS_ATTACH_EXE | + KDBUS_ATTACH_CMDLINE | + KDBUS_ATTACH_CGROUP | + KDBUS_ATTACH_CAPS | + KDBUS_ATTACH_SECLABEL | + KDBUS_ATTACH_AUDIT); + if (ret < 0) + goto exit_unref; + + b->name_registry = kdbus_name_registry_new(); + if (IS_ERR(b->name_registry)) { + ret = PTR_ERR(b->name_registry); + b->name_registry = NULL; + goto exit_unref; + } + + /* + * Bus-limits of the creator are accounted on its real UID, just like + * all other per-user limits. 
+ */ + b->creator = kdbus_user_lookup(domain, current_uid()); + if (IS_ERR(b->creator)) { + ret = PTR_ERR(b->creator); + b->creator = NULL; + goto exit_unref; + } + + return b; + +exit_unref: + kdbus_node_drain(&b->node); + kdbus_node_unref(&b->node); + return ERR_PTR(ret); +} + +/** + * kdbus_bus_ref() - increase the reference counter of a kdbus_bus + * @bus: The bus to reference + * + * Every user of a bus, except for its creator, must add a reference to the + * kdbus_bus using this function. + * + * Return: the bus itself + */ +struct kdbus_bus *kdbus_bus_ref(struct kdbus_bus *bus) +{ + if (bus) + kdbus_node_ref(&bus->node); + return bus; +} + +/** + * kdbus_bus_unref() - decrease the reference counter of a kdbus_bus + * @bus: The bus to unref + * + * Release a reference. If the reference count drops to 0, the bus will be + * freed. + * + * Return: NULL + */ +struct kdbus_bus *kdbus_bus_unref(struct kdbus_bus *bus) +{ + if (bus) + kdbus_node_unref(&bus->node); + return NULL; +} + +/** + * kdbus_bus_find_conn_by_id() - find a connection with a given id + * @bus: The bus to look for the connection + * @id: The 64-bit connection id + * + * Looks up a connection with a given id. The returned connection + * is ref'ed, and needs to be unref'ed by the user. Returns NULL if + * the connection can't be found. + */ +struct kdbus_conn *kdbus_bus_find_conn_by_id(struct kdbus_bus *bus, u64 id) +{ + struct kdbus_conn *conn, *found = NULL; + + down_read(&bus->conn_rwlock); + hash_for_each_possible(bus->conn_hash, conn, hentry, id) + if (conn->id == id) { + found = kdbus_conn_ref(conn); + break; + } + up_read(&bus->conn_rwlock); + + return found; +} + +/** + * kdbus_bus_broadcast() - send a message to all subscribed connections + * @bus: The bus the connections are connected to + * @conn_src: The source connection, may be %NULL for kernel notifications + * @staging: Staging object containing the message to send + * + * Send message to all connections that are currently active on the bus. + * Connections must still have matches installed in order to let the message + * pass. + * + * The caller must hold the name-registry lock of @bus. + */ +void kdbus_bus_broadcast(struct kdbus_bus *bus, + struct kdbus_conn *conn_src, + struct kdbus_staging *staging) +{ + struct kdbus_conn *conn_dst; + unsigned int i; + int ret; + + lockdep_assert_held(&bus->name_registry->rwlock); + + /* + * Make sure broadcast are queued on monitors before we send it out to + * anyone else. Otherwise, connections might react to broadcasts before + * the monitor gets the broadcast queued. In the worst case, the + * monitor sees a reaction to the broadcast before the broadcast itself. + * We don't give ordering guarantees across connections (and monitors + * can re-construct order via sequence numbers), but we should at least + * try to avoid re-ordering for monitors. + */ + kdbus_bus_eavesdrop(bus, conn_src, staging); + + down_read(&bus->conn_rwlock); + hash_for_each(bus->conn_hash, i, conn_dst, hentry) { + if (!kdbus_conn_is_ordinary(conn_dst)) + continue; + + /* + * Check if there is a match for the kmsg object in + * the destination connection match db + */ + if (!kdbus_match_db_match_msg(conn_dst->match_db, conn_src, + staging)) + continue; + + if (conn_src) { + /* + * Anyone can send broadcasts, as they have no + * destination. But a receiver needs TALK access to + * the sender in order to receive broadcasts. 
+ */ + if (!kdbus_conn_policy_talk(conn_dst, NULL, conn_src)) + continue; + } else { + /* + * Check if there is a policy db that prevents the + * destination connection from receiving this kernel + * notification + */ + if (!kdbus_conn_policy_see_notification(conn_dst, NULL, + staging->msg)) + continue; + } + + ret = kdbus_conn_entry_insert(conn_src, conn_dst, staging, + NULL, NULL); + if (ret < 0) + kdbus_conn_lost_message(conn_dst); + } + up_read(&bus->conn_rwlock); +} + +/** + * kdbus_bus_eavesdrop() - send a message to all subscribed monitors + * @bus: The bus the monitors are connected to + * @conn_src: The source connection, may be %NULL for kernel notifications + * @staging: Staging object containing the message to send + * + * Send message to all monitors that are currently active on the bus. Monitors + * must still have matches installed in order to let the message pass. + * + * The caller must hold the name-registry lock of @bus. + */ +void kdbus_bus_eavesdrop(struct kdbus_bus *bus, + struct kdbus_conn *conn_src, + struct kdbus_staging *staging) +{ + struct kdbus_conn *conn_dst; + int ret; + + /* + * Monitor connections get all messages; ignore possible errors + * when sending messages to monitor connections. + */ + + lockdep_assert_held(&bus->name_registry->rwlock); + + down_read(&bus->conn_rwlock); + list_for_each_entry(conn_dst, &bus->monitors_list, monitor_entry) { + ret = kdbus_conn_entry_insert(conn_src, conn_dst, staging, + NULL, NULL); + if (ret < 0) + kdbus_conn_lost_message(conn_dst); + } + up_read(&bus->conn_rwlock); +} + +/** + * kdbus_cmd_bus_make() - handle KDBUS_CMD_BUS_MAKE + * @domain: domain to operate on + * @argp: command payload + * + * Return: NULL or newly created bus on success, ERR_PTR on failure. + */ +struct kdbus_bus *kdbus_cmd_bus_make(struct kdbus_domain *domain, + void __user *argp) +{ + struct kdbus_bus *bus = NULL; + struct kdbus_cmd *cmd; + struct kdbus_ep *ep = NULL; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_MAKE_NAME, .mandatory = true }, + { .type = KDBUS_ITEM_BLOOM_PARAMETER, .mandatory = true }, + { .type = KDBUS_ITEM_ATTACH_FLAGS_SEND }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_MAKE_ACCESS_GROUP | + KDBUS_MAKE_ACCESS_WORLD, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + + bus = kdbus_bus_new(domain, + argv[1].item->str, &argv[2].item->bloom_parameter, + argv[3].item ? argv[3].item->data64 : NULL, + cmd->flags, current_euid(), current_egid()); + if (IS_ERR(bus)) { + ret = PTR_ERR(bus); + bus = NULL; + goto exit; + } + + if (atomic_inc_return(&bus->creator->buses) > KDBUS_USER_MAX_BUSES) { + atomic_dec(&bus->creator->buses); + ret = -EMFILE; + goto exit; + } + + if (!kdbus_node_activate(&bus->node)) { + atomic_dec(&bus->creator->buses); + ret = -ESHUTDOWN; + goto exit; + } + + ep = kdbus_ep_new(bus, "bus", cmd->flags, bus->node.uid, bus->node.gid, + false); + if (IS_ERR(ep)) { + ret = PTR_ERR(ep); + ep = NULL; + goto exit; + } + + if (!kdbus_node_activate(&ep->node)) { + ret = -ESHUTDOWN; + goto exit; + } + + /* + * Drop our own reference, effectively causing the endpoint to be + * deactivated and released when the parent bus is. 
+ */ + ep = kdbus_ep_unref(ep); + +exit: + ret = kdbus_args_clear(&args, ret); + if (ret < 0) { + if (ep) { + kdbus_node_drain(&ep->node); + kdbus_ep_unref(ep); + } + if (bus) { + kdbus_node_drain(&bus->node); + kdbus_bus_unref(bus); + } + return ERR_PTR(ret); + } + return bus; +} + +/** + * kdbus_cmd_bus_creator_info() - handle KDBUS_CMD_BUS_CREATOR_INFO + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_bus_creator_info(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_cmd_info *cmd; + struct kdbus_bus *bus = conn->ep->bus; + struct kdbus_pool_slice *slice = NULL; + struct kdbus_item *meta_items = NULL; + struct kdbus_item_header item_hdr; + struct kdbus_info info = {}; + size_t meta_size, name_len, cnt = 0; + struct kvec kvec[6]; + u64 attach_flags, size = 0; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + ret = kdbus_sanitize_attach_flags(cmd->attach_flags, &attach_flags); + if (ret < 0) + goto exit; + + attach_flags &= bus->attach_flags_owner; + + ret = kdbus_meta_emit(bus->creator_meta, NULL, NULL, conn, + attach_flags, &meta_items, &meta_size); + if (ret < 0) + goto exit; + + name_len = strlen(bus->node.name) + 1; + info.id = bus->id; + info.flags = bus->bus_flags; + item_hdr.type = KDBUS_ITEM_MAKE_NAME; + item_hdr.size = KDBUS_ITEM_HEADER_SIZE + name_len; + + kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &size); + kdbus_kvec_set(&kvec[cnt++], &item_hdr, sizeof(item_hdr), &size); + kdbus_kvec_set(&kvec[cnt++], bus->node.name, name_len, &size); + cnt += !!kdbus_kvec_pad(&kvec[cnt], &size); + if (meta_size > 0) { + kdbus_kvec_set(&kvec[cnt++], meta_items, meta_size, &size); + cnt += !!kdbus_kvec_pad(&kvec[cnt], &size); + } + + info.size = size; + + slice = kdbus_pool_slice_alloc(conn->pool, size, false); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto exit; + } + + ret = kdbus_pool_slice_copy_kvec(slice, 0, kvec, cnt, size); + if (ret < 0) + goto exit; + + kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->info_size); + + if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) || + kdbus_member_set_user(&cmd->info_size, argp, + typeof(*cmd), info_size)) + ret = -EFAULT; + +exit: + kdbus_pool_slice_release(slice); + kfree(meta_items); + return kdbus_args_clear(&args, ret); +} diff -Nur linux-4.2/ipc/kdbus/bus.h linux-4.2-pck/ipc/kdbus/bus.h --- linux-4.2/ipc/kdbus/bus.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/bus.h 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef __KDBUS_BUS_H +#define __KDBUS_BUS_H + +#include +#include +#include +#include +#include +#include + +#include "metadata.h" +#include "names.h" +#include "node.h" +#include "policy.h" + +struct kdbus_conn; +struct kdbus_domain; +struct kdbus_staging; +struct kdbus_user; + +/** + * struct kdbus_bus - bus in a domain + * @node: kdbus_node + * @id: ID of this bus in the domain + * @bus_flags: Simple pass-through flags from userspace to userspace + * @attach_flags_owner: KDBUS_ATTACH_* flags of bus creator that other + * connections can see or query + * @id128: Unique random 128 bit ID of this bus + * @bloom: Bloom parameters + * @domain: Domain of this bus + * @creator: Creator of the bus + * @creator_meta: Meta information about the bus creator + * @last_message_id: Last used message id + * @policy_db: Policy database for this bus + * @name_registry: Name registry of this bus + * @conn_rwlock: Read/Write lock for all lists of child connections + * @conn_hash: Map of connection IDs + * @monitors_list: Connections that monitor this bus + * @notify_list: List of pending kernel-generated messages + * @notify_lock: Notification list lock + * @notify_flush_lock: Notification flushing lock + */ +struct kdbus_bus { + struct kdbus_node node; + + /* static */ + u64 id; + u64 bus_flags; + u64 attach_flags_owner; + u8 id128[16]; + struct kdbus_bloom_parameter bloom; + struct kdbus_domain *domain; + struct kdbus_user *creator; + struct kdbus_meta_proc *creator_meta; + + /* protected by own locks */ + atomic64_t last_message_id; + struct kdbus_policy_db policy_db; + struct kdbus_name_registry *name_registry; + + /* protected by conn_rwlock */ + struct rw_semaphore conn_rwlock; + DECLARE_HASHTABLE(conn_hash, 8); + struct list_head monitors_list; + + /* protected by notify_lock */ + struct list_head notify_list; + spinlock_t notify_lock; + struct mutex notify_flush_lock; +}; + +struct kdbus_bus *kdbus_bus_ref(struct kdbus_bus *bus); +struct kdbus_bus *kdbus_bus_unref(struct kdbus_bus *bus); + +struct kdbus_conn *kdbus_bus_find_conn_by_id(struct kdbus_bus *bus, u64 id); +void kdbus_bus_broadcast(struct kdbus_bus *bus, + struct kdbus_conn *conn_src, + struct kdbus_staging *staging); +void kdbus_bus_eavesdrop(struct kdbus_bus *bus, + struct kdbus_conn *conn_src, + struct kdbus_staging *staging); + +struct kdbus_bus *kdbus_cmd_bus_make(struct kdbus_domain *domain, + void __user *argp); +int kdbus_cmd_bus_creator_info(struct kdbus_conn *conn, void __user *argp); + +#endif diff -Nur linux-4.2/ipc/kdbus/connection.c linux-4.2-pck/ipc/kdbus/connection.c --- linux-4.2/ipc/kdbus/connection.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/connection.c 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,2227 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "handle.h" +#include "match.h" +#include "message.h" +#include "metadata.h" +#include "names.h" +#include "domain.h" +#include "item.h" +#include "notify.h" +#include "policy.h" +#include "pool.h" +#include "reply.h" +#include "util.h" +#include "queue.h" + +#define KDBUS_CONN_ACTIVE_BIAS (INT_MIN + 2) +#define KDBUS_CONN_ACTIVE_NEW (INT_MIN + 1) + +static struct kdbus_conn *kdbus_conn_new(struct kdbus_ep *ep, + struct file *file, + struct kdbus_cmd_hello *hello, + const char *name, + const struct kdbus_creds *creds, + const struct kdbus_pids *pids, + const char *seclabel, + const char *conn_description) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key __key; +#endif + struct kdbus_pool_slice *slice = NULL; + struct kdbus_bus *bus = ep->bus; + struct kdbus_conn *conn; + u64 attach_flags_send; + u64 attach_flags_recv; + u64 items_size = 0; + bool is_policy_holder; + bool is_activator; + bool is_monitor; + bool privileged; + bool owner; + struct kvec kvec; + int ret; + + struct { + u64 size; + u64 type; + struct kdbus_bloom_parameter bloom; + } bloom_item; + + privileged = kdbus_ep_is_privileged(ep, file); + owner = kdbus_ep_is_owner(ep, file); + + is_monitor = hello->flags & KDBUS_HELLO_MONITOR; + is_activator = hello->flags & KDBUS_HELLO_ACTIVATOR; + is_policy_holder = hello->flags & KDBUS_HELLO_POLICY_HOLDER; + + if (!hello->pool_size || !IS_ALIGNED(hello->pool_size, PAGE_SIZE)) + return ERR_PTR(-EINVAL); + if (is_monitor + is_activator + is_policy_holder > 1) + return ERR_PTR(-EINVAL); + if (name && !is_activator && !is_policy_holder) + return ERR_PTR(-EINVAL); + if (!name && (is_activator || is_policy_holder)) + return ERR_PTR(-EINVAL); + if (name && !kdbus_name_is_valid(name, true)) + return ERR_PTR(-EINVAL); + if (is_monitor && ep->user) + return ERR_PTR(-EOPNOTSUPP); + if (!owner && (is_activator || is_policy_holder || is_monitor)) + return ERR_PTR(-EPERM); + if (!owner && (creds || pids || seclabel)) + return ERR_PTR(-EPERM); + + ret = kdbus_sanitize_attach_flags(hello->attach_flags_send, + &attach_flags_send); + if (ret < 0) + return ERR_PTR(ret); + + ret = kdbus_sanitize_attach_flags(hello->attach_flags_recv, + &attach_flags_recv); + if (ret < 0) + return ERR_PTR(ret); + + conn = kzalloc(sizeof(*conn), GFP_KERNEL); + if (!conn) + return ERR_PTR(-ENOMEM); + + kref_init(&conn->kref); + atomic_set(&conn->active, KDBUS_CONN_ACTIVE_NEW); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_init_map(&conn->dep_map, "s_active", &__key, 0); +#endif + mutex_init(&conn->lock); + INIT_LIST_HEAD(&conn->names_list); + INIT_LIST_HEAD(&conn->reply_list); + atomic_set(&conn->request_count, 0); + atomic_set(&conn->lost_count, 0); + INIT_DELAYED_WORK(&conn->work, kdbus_reply_list_scan_work); + conn->cred = get_cred(file->f_cred); + conn->pid = get_pid(task_pid(current)); + get_fs_root(current->fs, &conn->root_path); + init_waitqueue_head(&conn->wait); + kdbus_queue_init(&conn->queue); + conn->privileged = privileged; + conn->owner = owner; + conn->ep = kdbus_ep_ref(ep); + conn->id = atomic64_inc_return(&bus->domain->last_id); + conn->flags = hello->flags; + atomic64_set(&conn->attach_flags_send, attach_flags_send); + atomic64_set(&conn->attach_flags_recv, attach_flags_recv); + INIT_LIST_HEAD(&conn->monitor_entry); + + if 
(conn_description) { + conn->description = kstrdup(conn_description, GFP_KERNEL); + if (!conn->description) { + ret = -ENOMEM; + goto exit_unref; + } + } + + conn->pool = kdbus_pool_new(conn->description, hello->pool_size); + if (IS_ERR(conn->pool)) { + ret = PTR_ERR(conn->pool); + conn->pool = NULL; + goto exit_unref; + } + + conn->match_db = kdbus_match_db_new(); + if (IS_ERR(conn->match_db)) { + ret = PTR_ERR(conn->match_db); + conn->match_db = NULL; + goto exit_unref; + } + + /* return properties of this connection to the caller */ + hello->bus_flags = bus->bus_flags; + hello->id = conn->id; + + BUILD_BUG_ON(sizeof(bus->id128) != sizeof(hello->id128)); + memcpy(hello->id128, bus->id128, sizeof(hello->id128)); + + /* privileged processes can impersonate somebody else */ + if (creds || pids || seclabel) { + conn->meta_fake = kdbus_meta_fake_new(); + if (IS_ERR(conn->meta_fake)) { + ret = PTR_ERR(conn->meta_fake); + conn->meta_fake = NULL; + goto exit_unref; + } + + ret = kdbus_meta_fake_collect(conn->meta_fake, + creds, pids, seclabel); + if (ret < 0) + goto exit_unref; + } else { + conn->meta_proc = kdbus_meta_proc_new(); + if (IS_ERR(conn->meta_proc)) { + ret = PTR_ERR(conn->meta_proc); + conn->meta_proc = NULL; + goto exit_unref; + } + + ret = kdbus_meta_proc_collect(conn->meta_proc, + KDBUS_ATTACH_CREDS | + KDBUS_ATTACH_PIDS | + KDBUS_ATTACH_AUXGROUPS | + KDBUS_ATTACH_TID_COMM | + KDBUS_ATTACH_PID_COMM | + KDBUS_ATTACH_EXE | + KDBUS_ATTACH_CMDLINE | + KDBUS_ATTACH_CGROUP | + KDBUS_ATTACH_CAPS | + KDBUS_ATTACH_SECLABEL | + KDBUS_ATTACH_AUDIT); + if (ret < 0) + goto exit_unref; + } + + /* + * Account the connection against the current user (UID), or for + * custom endpoints use the anonymous user assigned to the endpoint. + * Note that limits are always accounted against the real UID, not + * the effective UID (cred->user always points to the accounting of + * cred->uid, not cred->euid). + * In case the caller is privileged, we allow changing the accounting + * to the faked user. 
+ */ + if (ep->user) { + conn->user = kdbus_user_ref(ep->user); + } else { + kuid_t uid; + + if (conn->meta_fake && uid_valid(conn->meta_fake->uid) && + conn->privileged) + uid = conn->meta_fake->uid; + else + uid = conn->cred->uid; + + conn->user = kdbus_user_lookup(ep->bus->domain, uid); + if (IS_ERR(conn->user)) { + ret = PTR_ERR(conn->user); + conn->user = NULL; + goto exit_unref; + } + } + + if (atomic_inc_return(&conn->user->connections) > KDBUS_USER_MAX_CONN) { + /* decremented by destructor as conn->user is valid */ + ret = -EMFILE; + goto exit_unref; + } + + bloom_item.size = sizeof(bloom_item); + bloom_item.type = KDBUS_ITEM_BLOOM_PARAMETER; + bloom_item.bloom = bus->bloom; + kdbus_kvec_set(&kvec, &bloom_item, bloom_item.size, &items_size); + + slice = kdbus_pool_slice_alloc(conn->pool, items_size, false); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto exit_unref; + } + + ret = kdbus_pool_slice_copy_kvec(slice, 0, &kvec, 1, items_size); + if (ret < 0) + goto exit_unref; + + kdbus_pool_slice_publish(slice, &hello->offset, &hello->items_size); + kdbus_pool_slice_release(slice); + + return conn; + +exit_unref: + kdbus_pool_slice_release(slice); + kdbus_conn_unref(conn); + return ERR_PTR(ret); +} + +static void __kdbus_conn_free(struct kref *kref) +{ + struct kdbus_conn *conn = container_of(kref, struct kdbus_conn, kref); + + WARN_ON(kdbus_conn_active(conn)); + WARN_ON(delayed_work_pending(&conn->work)); + WARN_ON(!list_empty(&conn->queue.msg_list)); + WARN_ON(!list_empty(&conn->names_list)); + WARN_ON(!list_empty(&conn->reply_list)); + + if (conn->user) { + atomic_dec(&conn->user->connections); + kdbus_user_unref(conn->user); + } + + kdbus_meta_fake_free(conn->meta_fake); + kdbus_meta_proc_unref(conn->meta_proc); + kdbus_match_db_free(conn->match_db); + kdbus_pool_free(conn->pool); + kdbus_ep_unref(conn->ep); + path_put(&conn->root_path); + put_pid(conn->pid); + put_cred(conn->cred); + kfree(conn->description); + kfree(conn->quota); + kfree(conn); +} + +/** + * kdbus_conn_ref() - take a connection reference + * @conn: Connection, may be %NULL + * + * Return: the connection itself + */ +struct kdbus_conn *kdbus_conn_ref(struct kdbus_conn *conn) +{ + if (conn) + kref_get(&conn->kref); + return conn; +} + +/** + * kdbus_conn_unref() - drop a connection reference + * @conn: Connection (may be NULL) + * + * When the last reference is dropped, the connection's internal structure + * is freed. + * + * Return: NULL + */ +struct kdbus_conn *kdbus_conn_unref(struct kdbus_conn *conn) +{ + if (conn) + kref_put(&conn->kref, __kdbus_conn_free); + return NULL; +} + +/** + * kdbus_conn_active() - connection is not disconnected + * @conn: Connection to check + * + * Return true if the connection was not disconnected, yet. Note that a + * connection might be disconnected asynchronously, unless you hold the + * connection lock. If that's not suitable for you, see kdbus_conn_acquire() to + * suppress connection shutdown for a short period. + * + * Return: true if the connection is still active + */ +bool kdbus_conn_active(const struct kdbus_conn *conn) +{ + return atomic_read(&conn->active) >= 0; +} + +/** + * kdbus_conn_acquire() - acquire an active connection reference + * @conn: Connection + * + * Users can close a connection via KDBUS_BYEBYE (or by destroying the + * endpoint/bus/...) at any time. Whenever this happens, we should deny any + * user-visible action on this connection and signal ECONNRESET instead. 
+ * To avoid testing for connection availability everytime you take the + * connection-lock, you can acquire a connection for short periods. + * + * By calling kdbus_conn_acquire(), you gain an "active reference" to the + * connection. You must also hold a regular reference at any time! As long as + * you hold the active-ref, the connection will not be shut down. However, if + * the connection was shut down, you can never acquire an active-ref again. + * + * kdbus_conn_disconnect() disables the connection and then waits for all active + * references to be dropped. It will also wake up any pending operation. + * However, you must not sleep for an indefinite period while holding an + * active-reference. Otherwise, kdbus_conn_disconnect() might stall. If you need + * to sleep for an indefinite period, either release the reference and try to + * acquire it again after waking up, or make kdbus_conn_disconnect() wake up + * your wait-queue. + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_conn_acquire(struct kdbus_conn *conn) +{ + if (!atomic_inc_unless_negative(&conn->active)) + return -ECONNRESET; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_acquire_read(&conn->dep_map, 0, 1, _RET_IP_); +#endif + + return 0; +} + +/** + * kdbus_conn_release() - release an active connection reference + * @conn: Connection + * + * This releases an active reference that has been acquired via + * kdbus_conn_acquire(). If the connection was already disabled and this is the + * last active-ref that is dropped, the disconnect-waiter will be woken up and + * properly close the connection. + */ +void kdbus_conn_release(struct kdbus_conn *conn) +{ + int v; + + if (!conn) + return; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_release(&conn->dep_map, 1, _RET_IP_); +#endif + + v = atomic_dec_return(&conn->active); + if (v != KDBUS_CONN_ACTIVE_BIAS) + return; + + wake_up_all(&conn->wait); +} + +static int kdbus_conn_connect(struct kdbus_conn *conn, const char *name) +{ + struct kdbus_ep *ep = conn->ep; + struct kdbus_bus *bus = ep->bus; + int ret; + + if (WARN_ON(atomic_read(&conn->active) != KDBUS_CONN_ACTIVE_NEW)) + return -EALREADY; + + /* make sure the ep-node is active while we add our connection */ + if (!kdbus_node_acquire(&ep->node)) + return -ESHUTDOWN; + + /* lock order: domain -> bus -> ep -> names -> conn */ + mutex_lock(&ep->lock); + down_write(&bus->conn_rwlock); + + /* link into monitor list */ + if (kdbus_conn_is_monitor(conn)) + list_add_tail(&conn->monitor_entry, &bus->monitors_list); + + /* link into bus and endpoint */ + list_add_tail(&conn->ep_entry, &ep->conn_list); + hash_add(bus->conn_hash, &conn->hentry, conn->id); + + /* enable lookups and acquire active ref */ + atomic_set(&conn->active, 1); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_acquire_read(&conn->dep_map, 0, 1, _RET_IP_); +#endif + + up_write(&bus->conn_rwlock); + mutex_unlock(&ep->lock); + + kdbus_node_release(&ep->node); + + /* + * Notify subscribers about the new active connection, unless it is + * a monitor. Monitors are invisible on the bus, can't be addressed + * directly, and won't cause any notifications. 
+ */ + if (!kdbus_conn_is_monitor(conn)) { + ret = kdbus_notify_id_change(bus, KDBUS_ITEM_ID_ADD, + conn->id, conn->flags); + if (ret < 0) + goto exit_disconnect; + } + + if (kdbus_conn_is_activator(conn)) { + u64 flags = KDBUS_NAME_ACTIVATOR; + + if (WARN_ON(!name)) { + ret = -EINVAL; + goto exit_disconnect; + } + + ret = kdbus_name_acquire(bus->name_registry, conn, name, + flags, NULL); + if (ret < 0) + goto exit_disconnect; + } + + kdbus_conn_release(conn); + kdbus_notify_flush(bus); + return 0; + +exit_disconnect: + kdbus_conn_release(conn); + kdbus_conn_disconnect(conn, false); + return ret; +} + +/** + * kdbus_conn_disconnect() - disconnect a connection + * @conn: The connection to disconnect + * @ensure_queue_empty: Flag to indicate if the call should fail in + * case the connection's message list is not + * empty + * + * If @ensure_msg_list_empty is true, and the connection has pending messages, + * -EBUSY is returned. + * + * Return: 0 on success, negative errno on failure + */ +int kdbus_conn_disconnect(struct kdbus_conn *conn, bool ensure_queue_empty) +{ + struct kdbus_queue_entry *entry, *tmp; + struct kdbus_bus *bus = conn->ep->bus; + struct kdbus_reply *r, *r_tmp; + struct kdbus_conn *c; + int i, v; + + mutex_lock(&conn->lock); + v = atomic_read(&conn->active); + if (v == KDBUS_CONN_ACTIVE_NEW) { + /* was never connected */ + mutex_unlock(&conn->lock); + return 0; + } + if (v < 0) { + /* already dead */ + mutex_unlock(&conn->lock); + return -ECONNRESET; + } + if (ensure_queue_empty && !list_empty(&conn->queue.msg_list)) { + /* still busy */ + mutex_unlock(&conn->lock); + return -EBUSY; + } + + atomic_add(KDBUS_CONN_ACTIVE_BIAS, &conn->active); + mutex_unlock(&conn->lock); + + wake_up_interruptible(&conn->wait); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + rwsem_acquire(&conn->dep_map, 0, 0, _RET_IP_); + if (atomic_read(&conn->active) != KDBUS_CONN_ACTIVE_BIAS) + lock_contended(&conn->dep_map, _RET_IP_); +#endif + + wait_event(conn->wait, + atomic_read(&conn->active) == KDBUS_CONN_ACTIVE_BIAS); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lock_acquired(&conn->dep_map, _RET_IP_); + rwsem_release(&conn->dep_map, 1, _RET_IP_); +#endif + + cancel_delayed_work_sync(&conn->work); + kdbus_policy_remove_owner(&conn->ep->bus->policy_db, conn); + + /* lock order: domain -> bus -> ep -> names -> conn */ + mutex_lock(&conn->ep->lock); + down_write(&bus->conn_rwlock); + + /* remove from bus and endpoint */ + hash_del(&conn->hentry); + list_del(&conn->monitor_entry); + list_del(&conn->ep_entry); + + up_write(&bus->conn_rwlock); + mutex_unlock(&conn->ep->lock); + + /* + * Remove all names associated with this connection; this possibly + * moves queued messages back to the activator connection. 
+ */ + kdbus_name_release_all(bus->name_registry, conn); + + /* if we die while other connections wait for our reply, notify them */ + mutex_lock(&conn->lock); + list_for_each_entry_safe(entry, tmp, &conn->queue.msg_list, entry) { + if (entry->reply) + kdbus_notify_reply_dead(bus, + entry->reply->reply_dst->id, + entry->reply->cookie); + kdbus_queue_entry_free(entry); + } + + list_for_each_entry_safe(r, r_tmp, &conn->reply_list, entry) + kdbus_reply_unlink(r); + mutex_unlock(&conn->lock); + + /* lock order: domain -> bus -> ep -> names -> conn */ + down_read(&bus->conn_rwlock); + hash_for_each(bus->conn_hash, i, c, hentry) { + mutex_lock(&c->lock); + list_for_each_entry_safe(r, r_tmp, &c->reply_list, entry) { + if (r->reply_src != conn) + continue; + + if (r->sync) + kdbus_sync_reply_wakeup(r, -EPIPE); + else + /* send a 'connection dead' notification */ + kdbus_notify_reply_dead(bus, c->id, r->cookie); + + kdbus_reply_unlink(r); + } + mutex_unlock(&c->lock); + } + up_read(&bus->conn_rwlock); + + if (!kdbus_conn_is_monitor(conn)) + kdbus_notify_id_change(bus, KDBUS_ITEM_ID_REMOVE, + conn->id, conn->flags); + + kdbus_notify_flush(bus); + + return 0; +} + +/** + * kdbus_conn_has_name() - check if a connection owns a name + * @conn: Connection + * @name: Well-know name to check for + * + * The caller must hold the registry lock of conn->ep->bus. + * + * Return: true if the name is currently owned by the connection + */ +bool kdbus_conn_has_name(struct kdbus_conn *conn, const char *name) +{ + struct kdbus_name_owner *owner; + + lockdep_assert_held(&conn->ep->bus->name_registry->rwlock); + + list_for_each_entry(owner, &conn->names_list, conn_entry) + if (!(owner->flags & KDBUS_NAME_IN_QUEUE) && + !strcmp(name, owner->name->name)) + return true; + + return false; +} + +struct kdbus_quota { + u32 memory; + u16 msgs; + u8 fds; +}; + +/** + * kdbus_conn_quota_inc() - increase quota accounting + * @c: connection owning the quota tracking + * @u: user to account for (or NULL for kernel accounting) + * @memory: size of memory to account for + * @fds: number of FDs to account for + * + * This call manages the quotas on resource @c. That is, it's used if other + * users want to use the resources of connection @c, which so far only concerns + * the receive queue of the destination. + * + * This increases the quota-accounting for user @u by @memory bytes and @fds + * file descriptors. If the user has already reached the quota limits, this call + * will not do any accounting but return a negative error code indicating the + * failure. + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_conn_quota_inc(struct kdbus_conn *c, struct kdbus_user *u, + size_t memory, size_t fds) +{ + struct kdbus_quota *quota; + size_t available, accounted; + unsigned int id; + + /* + * Pool Layout: + * 50% of a pool is always owned by the connection. It is reserved for + * kernel queries, handling received messages and other tasks that are + * under control of the pool owner. The other 50% of the pool are used + * as incoming queue. + * As we optionally support user-space based policies, we need fair + * allocation schemes. Furthermore, resource utilization should be + * maximized, so only minimal resources stay reserved. However, we need + * to adapt to a dynamic number of users, as we cannot know how many + * users will talk to a connection. Therefore, the current allocation + * works like this: + * We limit the number of bytes in a destination's pool per sending + * user. 
The space available for a user is 33% of the unused pool space + * (whereas the space used by the user itself is also treated as + * 'unused'). This way, we favor users coming first, but keep enough + * pool space available for any following users. Given that messages are + * dequeued in FIFO order, this should balance nicely if the number of + * users grows. At the same time, this algorithm guarantees that the + * space available to a connection is reduced dynamically, the more + * concurrent users talk to a connection. + */ + + /* per user-accounting is expensive, so we keep state small */ + BUILD_BUG_ON(sizeof(quota->memory) != 4); + BUILD_BUG_ON(sizeof(quota->msgs) != 2); + BUILD_BUG_ON(sizeof(quota->fds) != 1); + BUILD_BUG_ON(KDBUS_CONN_MAX_MSGS > U16_MAX); + BUILD_BUG_ON(KDBUS_CONN_MAX_FDS_PER_USER > U8_MAX); + + id = u ? u->id : KDBUS_USER_KERNEL_ID; + if (id >= c->n_quota) { + unsigned int users; + + users = max(KDBUS_ALIGN8(id) + 8, id); + quota = krealloc(c->quota, users * sizeof(*quota), + GFP_KERNEL | __GFP_ZERO); + if (!quota) + return -ENOMEM; + + c->n_quota = users; + c->quota = quota; + } + + quota = &c->quota[id]; + kdbus_pool_accounted(c->pool, &available, &accounted); + + /* half the pool is _always_ reserved for the pool owner */ + available /= 2; + + /* + * Pool owner slices are un-accounted slices; they can claim more + * than 50% of the queue. However, the slices we're dealing with here + * belong to the incoming queue, hence they are 'accounted' slices + * to which the 50%-limit applies. + */ + if (available < accounted) + return -ENOBUFS; + + /* 1/3 of the remaining space (including your own memory) */ + available = (available - accounted + quota->memory) / 3; + + if (available < quota->memory || + available - quota->memory < memory || + quota->memory + memory > U32_MAX) + return -ENOBUFS; + if (quota->msgs >= KDBUS_CONN_MAX_MSGS) + return -ENOBUFS; + if (quota->fds + fds < quota->fds || + quota->fds + fds > KDBUS_CONN_MAX_FDS_PER_USER) + return -EMFILE; + + quota->memory += memory; + quota->fds += fds; + ++quota->msgs; + return 0; +} + +/** + * kdbus_conn_quota_dec() - decrease quota accounting + * @c: connection owning the quota tracking + * @u: user which was accounted for (or NULL for kernel accounting) + * @memory: size of memory which was accounted for + * @fds: number of FDs which were accounted for + * + * This does the reverse of kdbus_conn_quota_inc(). You have to release any + * accounted resources that you called kdbus_conn_quota_inc() for. However, you + * must not call kdbus_conn_quota_dec() if the accounting failed (that is, + * kdbus_conn_quota_inc() failed). + */ +void kdbus_conn_quota_dec(struct kdbus_conn *c, struct kdbus_user *u, + size_t memory, size_t fds) +{ + struct kdbus_quota *quota; + unsigned int id; + + id = u ? u->id : KDBUS_USER_KERNEL_ID; + if (WARN_ON(id >= c->n_quota)) + return; + + quota = &c->quota[id]; + + if (!WARN_ON(quota->msgs == 0)) + --quota->msgs; + if (!WARN_ON(quota->memory < memory)) + quota->memory -= memory; + if (!WARN_ON(quota->fds < fds)) + quota->fds -= fds; +} + +/** + * kdbus_conn_lost_message() - handle lost messages + * @c: connection that lost a message + * + * kdbus is reliable. That means, we try hard to never lose messages. However, + * memory is limited, so we cannot rely on transmissions to never fail. + * Therefore, we use quota-limits to let callers know if their unicast message + * cannot be transmitted to a peer. 
This works fine for unicasts, but for
+ * broadcasts we cannot make the caller handle the transmission failure.
+ * Instead, we must let the destination know that it couldn't receive a
+ * broadcast.
+ * As this is an unlikely scenario, we keep it simple. A single lost-counter
+ * remembers the number of lost messages since the last call to RECV. The next
+ * message retrieval will notify the connection that it lost messages since the
+ * last message retrieval and thus should resync its state.
+ */
+void kdbus_conn_lost_message(struct kdbus_conn *c)
+{
+	if (atomic_inc_return(&c->lost_count) == 1)
+		wake_up_interruptible(&c->wait);
+}
+
+/* Callers should take the conn_dst lock */
+static struct kdbus_queue_entry *
+kdbus_conn_entry_make(struct kdbus_conn *conn_src,
+		      struct kdbus_conn *conn_dst,
+		      struct kdbus_staging *staging)
+{
+	/* The remote connection was disconnected */
+	if (!kdbus_conn_active(conn_dst))
+		return ERR_PTR(-ECONNRESET);
+
+	/*
+	 * If the connection does not accept file descriptors but the message
+	 * has some attached, refuse it.
+	 *
+	 * If this is a monitor connection, accept the message. In that
+	 * case, all file descriptors will be set to -1 at receive time.
+	 */
+	if (!kdbus_conn_is_monitor(conn_dst) &&
+	    !(conn_dst->flags & KDBUS_HELLO_ACCEPT_FD) &&
+	    staging->gaps && staging->gaps->n_fds > 0)
+		return ERR_PTR(-ECOMM);
+
+	return kdbus_queue_entry_new(conn_src, conn_dst, staging);
+}
+
+/*
+ * When synchronously responding to a message, allocate a queue entry
+ * and attach it to the reply tracking object.
+ * The connection's queue will never get to see it.
+ */
+static int kdbus_conn_entry_sync_attach(struct kdbus_conn *conn_dst,
+					struct kdbus_staging *staging,
+					struct kdbus_reply *reply_wake)
+{
+	struct kdbus_queue_entry *entry;
+	int remote_ret, ret = 0;
+
+	mutex_lock(&reply_wake->reply_dst->lock);
+
+	/*
+	 * If we are still waiting then proceed, allocate a queue
+	 * entry and attach it to the reply object
+	 */
+	if (reply_wake->waiting) {
+		entry = kdbus_conn_entry_make(reply_wake->reply_src, conn_dst,
+					      staging);
+		if (IS_ERR(entry))
+			ret = PTR_ERR(entry);
+		else
+			/* Attach the entry to the reply object */
+			reply_wake->queue_entry = entry;
+	} else {
+		ret = -ECONNRESET;
+	}
+
+	/*
+	 * Update the reply object and wake up the remote peer only
+	 * on appropriate return codes
+	 *
+	 * * -ECOMM: if the replying connection failed with -ECOMM
+	 *           then wake up the remote peer with -EREMOTEIO
+	 *
+	 *   We do this to differentiate between -ECOMM errors
+	 *   from the original sender's perspective:
+	 *   -ECOMM error during the sync send and
+	 *   -ECOMM error during the sync reply; this last
+	 *   one is rewritten to -EREMOTEIO
+	 *
+	 * * Wake up on all other return codes.
+	 */
+	remote_ret = ret;
+
+	if (ret == -ECOMM)
+		remote_ret = -EREMOTEIO;
+
+	kdbus_sync_reply_wakeup(reply_wake, remote_ret);
+	kdbus_reply_unlink(reply_wake);
+	mutex_unlock(&reply_wake->reply_dst->lock);
+
+	return ret;
+}
+
+/**
+ * kdbus_conn_entry_insert() - enqueue a message into the receiver's pool
+ * @conn_src:		The sending connection
+ * @conn_dst:		The connection to queue into
+ * @staging:		Message to send
+ * @reply:		The reply tracker to attach to the queue entry
+ * @name:		Destination name this msg is sent to, or NULL
+ *
+ * Return: 0 on success, negative error otherwise.
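+ *
+ * A caller that has already pinned @conn_dst and built @staging would
+ * typically use it roughly like this (the error label is only a sketch):
+ *
+ *	ret = kdbus_conn_entry_insert(conn_src, conn_dst, staging, NULL, name);
+ *	if (ret < 0)
+ *		goto exit_unref;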
+ */ +int kdbus_conn_entry_insert(struct kdbus_conn *conn_src, + struct kdbus_conn *conn_dst, + struct kdbus_staging *staging, + struct kdbus_reply *reply, + const struct kdbus_name_entry *name) +{ + struct kdbus_queue_entry *entry; + int ret; + + kdbus_conn_lock2(conn_src, conn_dst); + + entry = kdbus_conn_entry_make(conn_src, conn_dst, staging); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto exit_unlock; + } + + if (reply) { + kdbus_reply_link(reply); + if (!reply->sync) + schedule_delayed_work(&conn_src->work, 0); + } + + /* + * Record the sequence number of the registered name; it will + * be remembered by the queue, in case messages addressed to a + * name need to be moved from or to an activator. + */ + if (name) + entry->dst_name_id = name->name_id; + + kdbus_queue_entry_enqueue(entry, reply); + wake_up_interruptible(&conn_dst->wait); + + ret = 0; + +exit_unlock: + kdbus_conn_unlock2(conn_src, conn_dst); + return ret; +} + +static int kdbus_conn_wait_reply(struct kdbus_conn *conn_src, + struct kdbus_cmd_send *cmd_send, + struct file *ioctl_file, + struct file *cancel_fd, + struct kdbus_reply *reply_wait, + ktime_t expire) +{ + struct kdbus_queue_entry *entry; + struct poll_wqueues pwq = {}; + int ret; + + if (WARN_ON(!reply_wait)) + return -EIO; + + /* + * Block until the reply arrives. reply_wait is left untouched + * by the timeout scans that might be conducted for other, + * asynchronous replies of conn_src. + */ + + poll_initwait(&pwq); + poll_wait(ioctl_file, &conn_src->wait, &pwq.pt); + + for (;;) { + /* + * Any of the following conditions will stop our synchronously + * blocking SEND command: + * + * a) The origin sender closed its connection + * b) The remote peer answered, setting reply_wait->waiting = 0 + * c) The cancel FD was written to + * d) A signal was received + * e) The specified timeout was reached, and none of the above + * conditions kicked in. + */ + + /* + * We have already acquired an active reference when + * entering here, but another thread may call + * KDBUS_CMD_BYEBYE which does not acquire an active + * reference, therefore kdbus_conn_disconnect() will + * not wait for us. + */ + if (!kdbus_conn_active(conn_src)) { + ret = -ECONNRESET; + break; + } + + /* + * After the replying peer unset the waiting variable + * it will wake up us. + */ + if (!reply_wait->waiting) { + ret = reply_wait->err; + break; + } + + if (cancel_fd) { + unsigned int r; + + r = cancel_fd->f_op->poll(cancel_fd, &pwq.pt); + if (r & POLLIN) { + ret = -ECANCELED; + break; + } + } + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + if (!poll_schedule_timeout(&pwq, TASK_INTERRUPTIBLE, + &expire, 0)) { + ret = -ETIMEDOUT; + break; + } + + /* + * Reset the poll worker func, so the waitqueues are not + * added to the poll table again. We just reuse what we've + * collected earlier for further iterations. + */ + init_poll_funcptr(&pwq.pt, NULL); + } + + poll_freewait(&pwq); + + if (ret == -EINTR) { + /* + * Interrupted system call. Unref the reply object, and pass + * the return value down the chain. Mark the reply as + * interrupted, so the cleanup work can remove it, but do not + * unlink it from the list. Once the syscall restarts, we'll + * pick it up and wait on it again. 
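+		 * (kdbus_conn_call() below finds such an interrupted reply by
+		 * its cookie and resumes waiting on it instead of creating a
+		 * new reply tracker.)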
+ */ + mutex_lock(&conn_src->lock); + reply_wait->interrupted = true; + schedule_delayed_work(&conn_src->work, 0); + mutex_unlock(&conn_src->lock); + + return -ERESTARTSYS; + } + + mutex_lock(&conn_src->lock); + reply_wait->waiting = false; + entry = reply_wait->queue_entry; + if (entry) { + ret = kdbus_queue_entry_install(entry, + &cmd_send->reply.return_flags, + true); + kdbus_pool_slice_publish(entry->slice, &cmd_send->reply.offset, + &cmd_send->reply.msg_size); + kdbus_queue_entry_free(entry); + } + kdbus_reply_unlink(reply_wait); + mutex_unlock(&conn_src->lock); + + return ret; +} + +static int kdbus_pin_dst(struct kdbus_bus *bus, + struct kdbus_staging *staging, + struct kdbus_name_entry **out_name, + struct kdbus_conn **out_dst) +{ + const struct kdbus_msg *msg = staging->msg; + struct kdbus_name_owner *owner = NULL; + struct kdbus_name_entry *name = NULL; + struct kdbus_conn *dst = NULL; + int ret; + + lockdep_assert_held(&bus->name_registry->rwlock); + + if (!staging->dst_name) { + dst = kdbus_bus_find_conn_by_id(bus, msg->dst_id); + if (!dst) + return -ENXIO; + + if (!kdbus_conn_is_ordinary(dst)) { + ret = -ENXIO; + goto error; + } + } else { + name = kdbus_name_lookup_unlocked(bus->name_registry, + staging->dst_name); + if (name) + owner = kdbus_name_get_owner(name); + if (!owner) + return -ESRCH; + + /* + * If both a name and a connection ID are given as destination + * of a message, check that the currently owning connection of + * the name matches the specified ID. + * This way, we allow userspace to send the message to a + * specific connection by ID only if the connection currently + * owns the given name. + */ + if (msg->dst_id != KDBUS_DST_ID_NAME && + msg->dst_id != owner->conn->id) + return -EREMCHG; + + if ((msg->flags & KDBUS_MSG_NO_AUTO_START) && + kdbus_conn_is_activator(owner->conn)) + return -EADDRNOTAVAIL; + + dst = kdbus_conn_ref(owner->conn); + } + + *out_name = name; + *out_dst = dst; + return 0; + +error: + kdbus_conn_unref(dst); + return ret; +} + +static int kdbus_conn_reply(struct kdbus_conn *src, + struct kdbus_staging *staging) +{ + const struct kdbus_msg *msg = staging->msg; + struct kdbus_name_entry *name = NULL; + struct kdbus_reply *reply, *wake = NULL; + struct kdbus_conn *dst = NULL; + struct kdbus_bus *bus = src->ep->bus; + int ret; + + if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) || + WARN_ON(msg->flags & KDBUS_MSG_EXPECT_REPLY) || + WARN_ON(msg->flags & KDBUS_MSG_SIGNAL)) + return -EINVAL; + + /* name-registry must be locked for lookup *and* collecting data */ + down_read(&bus->name_registry->rwlock); + + /* find and pin destination */ + + ret = kdbus_pin_dst(bus, staging, &name, &dst); + if (ret < 0) + goto exit; + + mutex_lock(&dst->lock); + reply = kdbus_reply_find(src, dst, msg->cookie_reply); + if (reply) { + if (reply->sync) + wake = kdbus_reply_ref(reply); + kdbus_reply_unlink(reply); + } + mutex_unlock(&dst->lock); + + if (!reply) { + ret = -EBADSLT; + goto exit; + } + + /* send message */ + + kdbus_bus_eavesdrop(bus, src, staging); + + if (wake) + ret = kdbus_conn_entry_sync_attach(dst, staging, wake); + else + ret = kdbus_conn_entry_insert(src, dst, staging, NULL, name); + +exit: + up_read(&bus->name_registry->rwlock); + kdbus_reply_unref(wake); + kdbus_conn_unref(dst); + return ret; +} + +static struct kdbus_reply *kdbus_conn_call(struct kdbus_conn *src, + struct kdbus_staging *staging, + ktime_t exp) +{ + const struct kdbus_msg *msg = staging->msg; + struct kdbus_name_entry *name = NULL; + struct kdbus_reply *wait = NULL; + 
struct kdbus_conn *dst = NULL; + struct kdbus_bus *bus = src->ep->bus; + int ret; + + if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) || + WARN_ON(msg->flags & KDBUS_MSG_SIGNAL) || + WARN_ON(!(msg->flags & KDBUS_MSG_EXPECT_REPLY))) + return ERR_PTR(-EINVAL); + + /* resume previous wait-context, if available */ + + mutex_lock(&src->lock); + wait = kdbus_reply_find(NULL, src, msg->cookie); + if (wait) { + if (wait->interrupted) { + kdbus_reply_ref(wait); + wait->interrupted = false; + } else { + wait = NULL; + } + } + mutex_unlock(&src->lock); + + if (wait) + return wait; + + if (ktime_compare(ktime_get(), exp) >= 0) + return ERR_PTR(-ETIMEDOUT); + + /* name-registry must be locked for lookup *and* collecting data */ + down_read(&bus->name_registry->rwlock); + + /* find and pin destination */ + + ret = kdbus_pin_dst(bus, staging, &name, &dst); + if (ret < 0) + goto exit; + + if (!kdbus_conn_policy_talk(src, current_cred(), dst)) { + ret = -EPERM; + goto exit; + } + + wait = kdbus_reply_new(dst, src, msg, name, true); + if (IS_ERR(wait)) { + ret = PTR_ERR(wait); + wait = NULL; + goto exit; + } + + /* send message */ + + kdbus_bus_eavesdrop(bus, src, staging); + + ret = kdbus_conn_entry_insert(src, dst, staging, wait, name); + if (ret < 0) + goto exit; + + ret = 0; + +exit: + up_read(&bus->name_registry->rwlock); + if (ret < 0) { + kdbus_reply_unref(wait); + wait = ERR_PTR(ret); + } + kdbus_conn_unref(dst); + return wait; +} + +static int kdbus_conn_unicast(struct kdbus_conn *src, + struct kdbus_staging *staging) +{ + const struct kdbus_msg *msg = staging->msg; + struct kdbus_name_entry *name = NULL; + struct kdbus_reply *wait = NULL; + struct kdbus_conn *dst = NULL; + struct kdbus_bus *bus = src->ep->bus; + bool is_signal = (msg->flags & KDBUS_MSG_SIGNAL); + int ret = 0; + + if (WARN_ON(msg->dst_id == KDBUS_DST_ID_BROADCAST) || + WARN_ON(!(msg->flags & KDBUS_MSG_EXPECT_REPLY) && + msg->cookie_reply != 0)) + return -EINVAL; + + /* name-registry must be locked for lookup *and* collecting data */ + down_read(&bus->name_registry->rwlock); + + /* find and pin destination */ + + ret = kdbus_pin_dst(bus, staging, &name, &dst); + if (ret < 0) + goto exit; + + if (is_signal) { + /* like broadcasts we eavesdrop even if the msg is dropped */ + kdbus_bus_eavesdrop(bus, src, staging); + + /* drop silently if peer is not interested or not privileged */ + if (!kdbus_match_db_match_msg(dst->match_db, src, staging) || + !kdbus_conn_policy_talk(dst, NULL, src)) + goto exit; + } else if (!kdbus_conn_policy_talk(src, current_cred(), dst)) { + ret = -EPERM; + goto exit; + } else if (msg->flags & KDBUS_MSG_EXPECT_REPLY) { + wait = kdbus_reply_new(dst, src, msg, name, false); + if (IS_ERR(wait)) { + ret = PTR_ERR(wait); + wait = NULL; + goto exit; + } + } + + /* send message */ + + if (!is_signal) + kdbus_bus_eavesdrop(bus, src, staging); + + ret = kdbus_conn_entry_insert(src, dst, staging, wait, name); + if (ret < 0 && !is_signal) + goto exit; + + /* signals are treated like broadcasts, recv-errors are ignored */ + ret = 0; + +exit: + up_read(&bus->name_registry->rwlock); + kdbus_reply_unref(wait); + kdbus_conn_unref(dst); + return ret; +} + +/** + * kdbus_conn_move_messages() - move messages from one connection to another + * @conn_dst: Connection to copy to + * @conn_src: Connection to copy from + * @name_id: Filter for the sequence number of the registered + * name, 0 means no filtering. + * + * Move all messages from one connection to another. 
This is used when + * an implementer connection is taking over/giving back a well-known name + * from/to an activator connection. + */ +void kdbus_conn_move_messages(struct kdbus_conn *conn_dst, + struct kdbus_conn *conn_src, + u64 name_id) +{ + struct kdbus_queue_entry *e, *e_tmp; + struct kdbus_reply *r, *r_tmp; + struct kdbus_bus *bus; + struct kdbus_conn *c; + LIST_HEAD(msg_list); + int i, ret = 0; + + if (WARN_ON(conn_src == conn_dst)) + return; + + bus = conn_src->ep->bus; + + /* lock order: domain -> bus -> ep -> names -> conn */ + down_read(&bus->conn_rwlock); + hash_for_each(bus->conn_hash, i, c, hentry) { + if (c == conn_src || c == conn_dst) + continue; + + mutex_lock(&c->lock); + list_for_each_entry_safe(r, r_tmp, &c->reply_list, entry) { + if (r->reply_src != conn_src) + continue; + + /* filter messages for a specific name */ + if (name_id > 0 && r->name_id != name_id) + continue; + + kdbus_conn_unref(r->reply_src); + r->reply_src = kdbus_conn_ref(conn_dst); + } + mutex_unlock(&c->lock); + } + up_read(&bus->conn_rwlock); + + kdbus_conn_lock2(conn_src, conn_dst); + list_for_each_entry_safe(e, e_tmp, &conn_src->queue.msg_list, entry) { + /* filter messages for a specific name */ + if (name_id > 0 && e->dst_name_id != name_id) + continue; + + if (!(conn_dst->flags & KDBUS_HELLO_ACCEPT_FD) && + e->gaps && e->gaps->n_fds > 0) { + kdbus_conn_lost_message(conn_dst); + kdbus_queue_entry_free(e); + continue; + } + + ret = kdbus_queue_entry_move(e, conn_dst); + if (ret < 0) { + kdbus_conn_lost_message(conn_dst); + kdbus_queue_entry_free(e); + continue; + } + } + kdbus_conn_unlock2(conn_src, conn_dst); + + /* wake up poll() */ + wake_up_interruptible(&conn_dst->wait); +} + +/* query the policy-database for all names of @whom */ +static bool kdbus_conn_policy_query_all(struct kdbus_conn *conn, + const struct cred *conn_creds, + struct kdbus_policy_db *db, + struct kdbus_conn *whom, + unsigned int access) +{ + struct kdbus_name_owner *owner; + bool pass = false; + int res; + + lockdep_assert_held(&conn->ep->bus->name_registry->rwlock); + + down_read(&db->entries_rwlock); + mutex_lock(&whom->lock); + + list_for_each_entry(owner, &whom->names_list, conn_entry) { + if (owner->flags & KDBUS_NAME_IN_QUEUE) + continue; + + res = kdbus_policy_query_unlocked(db, + conn_creds ? : conn->cred, + owner->name->name, + kdbus_strhash(owner->name->name)); + if (res >= (int)access) { + pass = true; + break; + } + } + + mutex_unlock(&whom->lock); + up_read(&db->entries_rwlock); + + return pass; +} + +/** + * kdbus_conn_policy_own_name() - verify a connection can own the given name + * @conn: Connection + * @conn_creds: Credentials of @conn to use for policy check + * @name: Name + * + * This verifies that @conn is allowed to acquire the well-known name @name. + * + * Return: true if allowed, false if not. 
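+ *
+ * A name-acquisition path would typically gate on this check, roughly:
+ *
+ *	if (!kdbus_conn_policy_own_name(conn, current_cred(), name))
+ *		return -EPERM;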
+ */ +bool kdbus_conn_policy_own_name(struct kdbus_conn *conn, + const struct cred *conn_creds, + const char *name) +{ + unsigned int hash = kdbus_strhash(name); + int res; + + if (!conn_creds) + conn_creds = conn->cred; + + if (conn->ep->user) { + res = kdbus_policy_query(&conn->ep->policy_db, conn_creds, + name, hash); + if (res < KDBUS_POLICY_OWN) + return false; + } + + if (conn->owner) + return true; + + res = kdbus_policy_query(&conn->ep->bus->policy_db, conn_creds, + name, hash); + return res >= KDBUS_POLICY_OWN; +} + +/** + * kdbus_conn_policy_talk() - verify a connection can talk to a given peer + * @conn: Connection that tries to talk + * @conn_creds: Credentials of @conn to use for policy check + * @to: Connection that is talked to + * + * This verifies that @conn is allowed to talk to @to. + * + * Return: true if allowed, false if not. + */ +bool kdbus_conn_policy_talk(struct kdbus_conn *conn, + const struct cred *conn_creds, + struct kdbus_conn *to) +{ + if (!conn_creds) + conn_creds = conn->cred; + + if (conn->ep->user && + !kdbus_conn_policy_query_all(conn, conn_creds, &conn->ep->policy_db, + to, KDBUS_POLICY_TALK)) + return false; + + if (conn->owner) + return true; + if (uid_eq(conn_creds->euid, to->cred->uid)) + return true; + + return kdbus_conn_policy_query_all(conn, conn_creds, + &conn->ep->bus->policy_db, to, + KDBUS_POLICY_TALK); +} + +/** + * kdbus_conn_policy_see_name_unlocked() - verify a connection can see a given + * name + * @conn: Connection + * @conn_creds: Credentials of @conn to use for policy check + * @name: Name + * + * This verifies that @conn is allowed to see the well-known name @name. Caller + * must hold policy-lock. + * + * Return: true if allowed, false if not. + */ +bool kdbus_conn_policy_see_name_unlocked(struct kdbus_conn *conn, + const struct cred *conn_creds, + const char *name) +{ + int res; + + /* + * By default, all names are visible on a bus. SEE policies can only be + * installed on custom endpoints, where by default no name is visible. + */ + if (!conn->ep->user) + return true; + + res = kdbus_policy_query_unlocked(&conn->ep->policy_db, + conn_creds ? : conn->cred, + name, kdbus_strhash(name)); + return res >= KDBUS_POLICY_SEE; +} + +static bool kdbus_conn_policy_see_name(struct kdbus_conn *conn, + const struct cred *conn_creds, + const char *name) +{ + bool res; + + down_read(&conn->ep->policy_db.entries_rwlock); + res = kdbus_conn_policy_see_name_unlocked(conn, conn_creds, name); + up_read(&conn->ep->policy_db.entries_rwlock); + + return res; +} + +static bool kdbus_conn_policy_see(struct kdbus_conn *conn, + const struct cred *conn_creds, + struct kdbus_conn *whom) +{ + /* + * By default, all names are visible on a bus, so a connection can + * always see other connections. SEE policies can only be installed on + * custom endpoints, where by default no name is visible and we hide + * peers from each other, unless you see at least _one_ name of the + * peer. + */ + return !conn->ep->user || + kdbus_conn_policy_query_all(conn, conn_creds, + &conn->ep->policy_db, whom, + KDBUS_POLICY_SEE); +} + +/** + * kdbus_conn_policy_see_notification() - verify a connection is allowed to + * receive a given kernel notification + * @conn: Connection + * @conn_creds: Credentials of @conn to use for policy check + * @msg: Notification message + * + * This checks whether @conn is allowed to see the kernel notification. + * + * Return: true if allowed, false if not. 
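+ *
+ * Paths that fan out kernel notifications can use it as a per-receiver
+ * filter, along the lines of:
+ *
+ *	if (!kdbus_conn_policy_see_notification(conn, NULL, msg))
+ *		continue;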
+ */ +bool kdbus_conn_policy_see_notification(struct kdbus_conn *conn, + const struct cred *conn_creds, + const struct kdbus_msg *msg) +{ + /* + * Depending on the notification type, broadcasted kernel notifications + * have to be filtered: + * + * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE}: This notification is forwarded + * to a peer if, and only if, that peer can see the name this + * notification is for. + * + * KDBUS_ITEM_ID_{ADD,REMOVE}: Notifications for ID changes are + * broadcast to everyone, to allow tracking peers. + */ + + switch (msg->items[0].type) { + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_REMOVE: + case KDBUS_ITEM_NAME_CHANGE: + return kdbus_conn_policy_see_name(conn, conn_creds, + msg->items[0].name_change.name); + + case KDBUS_ITEM_ID_ADD: + case KDBUS_ITEM_ID_REMOVE: + return true; + + default: + WARN(1, "Invalid type for notification broadcast: %llu\n", + (unsigned long long)msg->items[0].type); + return false; + } +} + +/** + * kdbus_cmd_hello() - handle KDBUS_CMD_HELLO + * @ep: Endpoint to operate on + * @file: File this connection is opened on + * @argp: Command payload + * + * Return: NULL or newly created connection on success, ERR_PTR on failure. + */ +struct kdbus_conn *kdbus_cmd_hello(struct kdbus_ep *ep, struct file *file, + void __user *argp) +{ + struct kdbus_cmd_hello *cmd; + struct kdbus_conn *c = NULL; + const char *item_name; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_NAME }, + { .type = KDBUS_ITEM_CREDS }, + { .type = KDBUS_ITEM_PIDS }, + { .type = KDBUS_ITEM_SECLABEL }, + { .type = KDBUS_ITEM_CONN_DESCRIPTION }, + { .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_HELLO_ACCEPT_FD | + KDBUS_HELLO_ACTIVATOR | + KDBUS_HELLO_POLICY_HOLDER | + KDBUS_HELLO_MONITOR, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + + item_name = argv[1].item ? argv[1].item->str : NULL; + + c = kdbus_conn_new(ep, file, cmd, item_name, + argv[2].item ? &argv[2].item->creds : NULL, + argv[3].item ? &argv[3].item->pids : NULL, + argv[4].item ? argv[4].item->str : NULL, + argv[5].item ? argv[5].item->str : NULL); + if (IS_ERR(c)) { + ret = PTR_ERR(c); + c = NULL; + goto exit; + } + + ret = kdbus_conn_connect(c, item_name); + if (ret < 0) + goto exit; + + if (kdbus_conn_is_activator(c) || kdbus_conn_is_policy_holder(c)) { + ret = kdbus_conn_acquire(c); + if (ret < 0) + goto exit; + + ret = kdbus_policy_set(&c->ep->bus->policy_db, args.items, + args.items_size, 1, + kdbus_conn_is_policy_holder(c), c); + kdbus_conn_release(c); + if (ret < 0) + goto exit; + } + + if (copy_to_user(argp, cmd, sizeof(*cmd))) + ret = -EFAULT; + +exit: + ret = kdbus_args_clear(&args, ret); + if (ret < 0) { + if (c) { + kdbus_conn_disconnect(c, false); + kdbus_conn_unref(c); + } + return ERR_PTR(ret); + } + return c; +} + +/** + * kdbus_cmd_byebye_unlocked() - handle KDBUS_CMD_BYEBYE + * @conn: connection to operate on + * @argp: command payload + * + * The caller must not hold any active reference to @conn or this will deadlock. + * + * Return: >=0 on success, negative error code on failure. 
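+ *
+ * From user-space this corresponds to something like (conn_fd being the
+ * connection's file descriptor):
+ *
+ *	struct kdbus_cmd cmd = { .size = sizeof(cmd) };
+ *
+ *	ioctl(conn_fd, KDBUS_CMD_BYEBYE, &cmd);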
+ */ +int kdbus_cmd_byebye_unlocked(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_cmd *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + ret = kdbus_conn_disconnect(conn, true); + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_conn_info() - handle KDBUS_CMD_CONN_INFO + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_conn_info(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_meta_conn *conn_meta = NULL; + struct kdbus_pool_slice *slice = NULL; + struct kdbus_name_entry *entry = NULL; + struct kdbus_name_owner *owner = NULL; + struct kdbus_conn *owner_conn = NULL; + struct kdbus_item *meta_items = NULL; + struct kdbus_info info = {}; + struct kdbus_cmd_info *cmd; + struct kdbus_bus *bus = conn->ep->bus; + struct kvec kvec[3]; + size_t meta_size, cnt = 0; + const char *name; + u64 attach_flags, size = 0; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_NAME }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + /* registry must be held throughout lookup *and* collecting data */ + down_read(&bus->name_registry->rwlock); + + ret = kdbus_sanitize_attach_flags(cmd->attach_flags, &attach_flags); + if (ret < 0) + goto exit; + + name = argv[1].item ? 
argv[1].item->str : NULL; + + if (name) { + entry = kdbus_name_lookup_unlocked(bus->name_registry, name); + if (entry) + owner = kdbus_name_get_owner(entry); + if (!owner || + !kdbus_conn_policy_see_name(conn, current_cred(), name) || + (cmd->id != 0 && owner->conn->id != cmd->id)) { + /* pretend a name doesn't exist if you cannot see it */ + ret = -ESRCH; + goto exit; + } + + owner_conn = kdbus_conn_ref(owner->conn); + } else if (cmd->id > 0) { + owner_conn = kdbus_bus_find_conn_by_id(bus, cmd->id); + if (!owner_conn || !kdbus_conn_policy_see(conn, current_cred(), + owner_conn)) { + /* pretend an id doesn't exist if you cannot see it */ + ret = -ENXIO; + goto exit; + } + } else { + ret = -EINVAL; + goto exit; + } + + attach_flags &= atomic64_read(&owner_conn->attach_flags_send); + + conn_meta = kdbus_meta_conn_new(); + if (IS_ERR(conn_meta)) { + ret = PTR_ERR(conn_meta); + conn_meta = NULL; + goto exit; + } + + ret = kdbus_meta_conn_collect(conn_meta, owner_conn, 0, attach_flags); + if (ret < 0) + goto exit; + + ret = kdbus_meta_emit(owner_conn->meta_proc, owner_conn->meta_fake, + conn_meta, conn, attach_flags, + &meta_items, &meta_size); + if (ret < 0) + goto exit; + + info.id = owner_conn->id; + info.flags = owner_conn->flags; + + kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &size); + if (meta_size > 0) { + kdbus_kvec_set(&kvec[cnt++], meta_items, meta_size, &size); + cnt += !!kdbus_kvec_pad(&kvec[cnt], &size); + } + + info.size = size; + + slice = kdbus_pool_slice_alloc(conn->pool, size, false); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto exit; + } + + ret = kdbus_pool_slice_copy_kvec(slice, 0, kvec, cnt, size); + if (ret < 0) + goto exit; + + kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->info_size); + + if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) || + kdbus_member_set_user(&cmd->info_size, argp, + typeof(*cmd), info_size)) { + ret = -EFAULT; + goto exit; + } + + ret = 0; + +exit: + up_read(&bus->name_registry->rwlock); + kdbus_pool_slice_release(slice); + kfree(meta_items); + kdbus_meta_conn_unref(conn_meta); + kdbus_conn_unref(owner_conn); + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_update() - handle KDBUS_CMD_UPDATE + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_update(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_item *item_policy; + u64 *item_attach_send = NULL; + u64 *item_attach_recv = NULL; + struct kdbus_cmd *cmd; + u64 attach_send; + u64 attach_recv; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_ATTACH_FLAGS_SEND }, + { .type = KDBUS_ITEM_ATTACH_FLAGS_RECV }, + { .type = KDBUS_ITEM_NAME, .multiple = true }, + { .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + item_attach_send = argv[1].item ? &argv[1].item->data64[0] : NULL; + item_attach_recv = argv[2].item ? &argv[2].item->data64[0] : NULL; + item_policy = argv[3].item ? 
: argv[4].item; + + if (item_attach_send) { + if (!kdbus_conn_is_ordinary(conn) && + !kdbus_conn_is_monitor(conn)) { + ret = -EOPNOTSUPP; + goto exit; + } + + ret = kdbus_sanitize_attach_flags(*item_attach_send, + &attach_send); + if (ret < 0) + goto exit; + } + + if (item_attach_recv) { + if (!kdbus_conn_is_ordinary(conn) && + !kdbus_conn_is_monitor(conn) && + !kdbus_conn_is_activator(conn)) { + ret = -EOPNOTSUPP; + goto exit; + } + + ret = kdbus_sanitize_attach_flags(*item_attach_recv, + &attach_recv); + if (ret < 0) + goto exit; + } + + if (item_policy && !kdbus_conn_is_policy_holder(conn)) { + ret = -EOPNOTSUPP; + goto exit; + } + + /* now that we verified the input, update the connection */ + + if (item_policy) { + ret = kdbus_policy_set(&conn->ep->bus->policy_db, cmd->items, + KDBUS_ITEMS_SIZE(cmd, items), + 1, true, conn); + if (ret < 0) + goto exit; + } + + if (item_attach_send) + atomic64_set(&conn->attach_flags_send, attach_send); + + if (item_attach_recv) + atomic64_set(&conn->attach_flags_recv, attach_recv); + +exit: + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_send() - handle KDBUS_CMD_SEND + * @conn: connection to operate on + * @f: file this command was called on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_send(struct kdbus_conn *conn, struct file *f, void __user *argp) +{ + struct kdbus_cmd_send *cmd; + struct kdbus_staging *staging = NULL; + struct kdbus_msg *msg = NULL; + struct file *cancel_fd = NULL; + int ret, ret2; + + /* command arguments */ + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_CANCEL_FD }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_SEND_SYNC_REPLY, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + /* message arguments */ + struct kdbus_arg msg_argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_PAYLOAD_VEC, .multiple = true }, + { .type = KDBUS_ITEM_PAYLOAD_MEMFD, .multiple = true }, + { .type = KDBUS_ITEM_FDS }, + { .type = KDBUS_ITEM_BLOOM_FILTER }, + { .type = KDBUS_ITEM_DST_NAME }, + }; + struct kdbus_args msg_args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_MSG_EXPECT_REPLY | + KDBUS_MSG_NO_AUTO_START | + KDBUS_MSG_SIGNAL, + .argv = msg_argv, + .argc = ARRAY_SIZE(msg_argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + /* make sure to parse both, @cmd and @msg on negotiation */ + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret < 0) + goto exit; + else if (ret > 0 && !cmd->msg_address) /* negotiation without msg */ + goto exit; + + ret2 = kdbus_args_parse_msg(&msg_args, KDBUS_PTR(cmd->msg_address), + &msg); + if (ret2 < 0) { /* cannot parse message */ + ret = ret2; + goto exit; + } else if (ret2 > 0 && !ret) { /* msg-negot implies cmd-negot */ + ret = -EINVAL; + goto exit; + } else if (ret > 0) { /* negotiation */ + goto exit; + } + + /* here we parsed both, @cmd and @msg, and neither wants negotiation */ + + cmd->reply.return_flags = 0; + kdbus_pool_publish_empty(conn->pool, &cmd->reply.offset, + &cmd->reply.msg_size); + + if (argv[1].item) { + cancel_fd = fget(argv[1].item->fds[0]); + if (!cancel_fd) { + ret = -EBADF; + goto exit; + } + + if (!cancel_fd->f_op->poll) { + ret = -EINVAL; + goto exit; + } + } + + /* patch-in the source of this message */ + if (msg->src_id > 0 && msg->src_id != conn->id) { + ret = -EINVAL; + goto exit; + } + msg->src_id = conn->id; + + staging = kdbus_staging_new_user(conn->ep->bus, cmd, msg); + 
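+	/*
+	 * On success, the staged message is dispatched below: broadcasts
+	 * fan out to all matching peers, sync calls block until the reply
+	 * (or a timeout/cancel) arrives, replies resolve an existing reply
+	 * tracker, and everything else is a plain unicast.
+	 */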
if (IS_ERR(staging)) { + ret = PTR_ERR(staging); + staging = NULL; + goto exit; + } + + if (msg->dst_id == KDBUS_DST_ID_BROADCAST) { + down_read(&conn->ep->bus->name_registry->rwlock); + kdbus_bus_broadcast(conn->ep->bus, conn, staging); + up_read(&conn->ep->bus->name_registry->rwlock); + } else if (cmd->flags & KDBUS_SEND_SYNC_REPLY) { + struct kdbus_reply *r; + ktime_t exp; + + exp = ns_to_ktime(msg->timeout_ns); + r = kdbus_conn_call(conn, staging, exp); + if (IS_ERR(r)) { + ret = PTR_ERR(r); + goto exit; + } + + ret = kdbus_conn_wait_reply(conn, cmd, f, cancel_fd, r, exp); + kdbus_reply_unref(r); + if (ret < 0) + goto exit; + } else if ((msg->flags & KDBUS_MSG_EXPECT_REPLY) || + msg->cookie_reply == 0) { + ret = kdbus_conn_unicast(conn, staging); + if (ret < 0) + goto exit; + } else { + ret = kdbus_conn_reply(conn, staging); + if (ret < 0) + goto exit; + } + + if (kdbus_member_set_user(&cmd->reply, argp, typeof(*cmd), reply)) + ret = -EFAULT; + +exit: + if (cancel_fd) + fput(cancel_fd); + kdbus_staging_free(staging); + ret = kdbus_args_clear(&msg_args, ret); + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_recv() - handle KDBUS_CMD_RECV + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_recv(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_queue_entry *entry; + struct kdbus_cmd_recv *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_RECV_PEEK | + KDBUS_RECV_DROP | + KDBUS_RECV_USE_PRIORITY, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn) && + !kdbus_conn_is_monitor(conn) && + !kdbus_conn_is_activator(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + cmd->dropped_msgs = 0; + cmd->msg.return_flags = 0; + kdbus_pool_publish_empty(conn->pool, &cmd->msg.offset, + &cmd->msg.msg_size); + + /* DROP+priority is not realiably, so prevent it */ + if ((cmd->flags & KDBUS_RECV_DROP) && + (cmd->flags & KDBUS_RECV_USE_PRIORITY)) { + ret = -EINVAL; + goto exit; + } + + mutex_lock(&conn->lock); + + entry = kdbus_queue_peek(&conn->queue, cmd->priority, + cmd->flags & KDBUS_RECV_USE_PRIORITY); + if (!entry) { + mutex_unlock(&conn->lock); + ret = -EAGAIN; + } else if (cmd->flags & KDBUS_RECV_DROP) { + struct kdbus_reply *reply = kdbus_reply_ref(entry->reply); + + kdbus_queue_entry_free(entry); + + mutex_unlock(&conn->lock); + + if (reply) { + mutex_lock(&reply->reply_dst->lock); + if (!list_empty(&reply->entry)) { + kdbus_reply_unlink(reply); + if (reply->sync) + kdbus_sync_reply_wakeup(reply, -EPIPE); + else + kdbus_notify_reply_dead(conn->ep->bus, + reply->reply_dst->id, + reply->cookie); + } + mutex_unlock(&reply->reply_dst->lock); + kdbus_notify_flush(conn->ep->bus); + } + + kdbus_reply_unref(reply); + } else { + bool install_fds; + + /* + * PEEK just returns the location of the next message. Do not + * install FDs nor memfds nor anything else. The only + * information of interest should be the message header and + * metadata. Any FD numbers in the payload is undefined for + * PEEK'ed messages. + * Also make sure to never install fds into a connection that + * has refused to receive any. Ordinary connections will not get + * messages with FDs queued (the receiver will get -ECOMM), but + * eavesdroppers might. 
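+		 *
+		 * For comparison, a user-space peek would look roughly like
+		 * this (conn_fd being the connection's file descriptor):
+		 *
+		 *	struct kdbus_cmd_recv recv = {
+		 *		.size = sizeof(recv),
+		 *		.flags = KDBUS_RECV_PEEK,
+		 *	};
+		 *
+		 *	ioctl(conn_fd, KDBUS_CMD_RECV, &recv);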
+ */ + install_fds = (conn->flags & KDBUS_HELLO_ACCEPT_FD) && + !(cmd->flags & KDBUS_RECV_PEEK); + + ret = kdbus_queue_entry_install(entry, + &cmd->msg.return_flags, + install_fds); + if (ret < 0) { + mutex_unlock(&conn->lock); + goto exit; + } + + kdbus_pool_slice_publish(entry->slice, &cmd->msg.offset, + &cmd->msg.msg_size); + + if (!(cmd->flags & KDBUS_RECV_PEEK)) + kdbus_queue_entry_free(entry); + + mutex_unlock(&conn->lock); + } + + cmd->dropped_msgs = atomic_xchg(&conn->lost_count, 0); + if (cmd->dropped_msgs > 0) + cmd->return_flags |= KDBUS_RECV_RETURN_DROPPED_MSGS; + + if (kdbus_member_set_user(&cmd->msg, argp, typeof(*cmd), msg) || + kdbus_member_set_user(&cmd->dropped_msgs, argp, typeof(*cmd), + dropped_msgs)) + ret = -EFAULT; + +exit: + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_free() - handle KDBUS_CMD_FREE + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_free(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_cmd_free *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn) && + !kdbus_conn_is_monitor(conn) && + !kdbus_conn_is_activator(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + ret = kdbus_pool_release_offset(conn->pool, cmd->offset); + + return kdbus_args_clear(&args, ret); +} diff -Nur linux-4.2/ipc/kdbus/connection.h linux-4.2-pck/ipc/kdbus/connection.h --- linux-4.2/ipc/kdbus/connection.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/connection.h 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,260 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_CONNECTION_H +#define __KDBUS_CONNECTION_H + +#include +#include +#include +#include + +#include "limits.h" +#include "metadata.h" +#include "pool.h" +#include "queue.h" +#include "util.h" + +#define KDBUS_HELLO_SPECIAL_CONN (KDBUS_HELLO_ACTIVATOR | \ + KDBUS_HELLO_POLICY_HOLDER | \ + KDBUS_HELLO_MONITOR) + +struct kdbus_name_entry; +struct kdbus_quota; +struct kdbus_staging; + +/** + * struct kdbus_conn - connection to a bus + * @kref: Reference count + * @active: Active references to the connection + * @id: Connection ID + * @flags: KDBUS_HELLO_* flags + * @attach_flags_send: KDBUS_ATTACH_* flags for sending + * @attach_flags_recv: KDBUS_ATTACH_* flags for receiving + * @description: Human-readable connection description, used for + * debugging. This field is only set when the + * connection is created. 
+ * @ep: The endpoint this connection belongs to + * @lock: Connection data lock + * @hentry: Entry in ID <-> connection map + * @ep_entry: Entry in endpoint + * @monitor_entry: Entry in monitor, if the connection is a monitor + * @reply_list: List of connections this connection should + * reply to + * @work: Delayed work to handle timeouts + * activator for + * @match_db: Subscription filter to broadcast messages + * @meta_proc: Process metadata of connection creator, or NULL + * @meta_fake: Faked metadata, or NULL + * @pool: The user's buffer to receive messages + * @user: Owner of the connection + * @cred: The credentials of the connection at creation time + * @pid: Pid at creation time + * @root_path: Root path at creation time + * @request_count: Number of pending requests issued by this + * connection that are waiting for replies from + * other peers + * @lost_count: Number of lost broadcast messages + * @wait: Wake up this endpoint + * @queue: The message queue associated with this connection + * @quota: Array of per-user quota indexed by user->id + * @n_quota: Number of elements in quota array + * @names_list: List of well-known names + * @name_count: Number of owned well-known names + * @privileged: Whether this connection is privileged on the domain + * @owner: Owned by the same user as the bus owner + */ +struct kdbus_conn { + struct kref kref; + atomic_t active; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif + u64 id; + u64 flags; + atomic64_t attach_flags_send; + atomic64_t attach_flags_recv; + const char *description; + struct kdbus_ep *ep; + struct mutex lock; + struct hlist_node hentry; + struct list_head ep_entry; + struct list_head monitor_entry; + struct list_head reply_list; + struct delayed_work work; + struct kdbus_match_db *match_db; + struct kdbus_meta_proc *meta_proc; + struct kdbus_meta_fake *meta_fake; + struct kdbus_pool *pool; + struct kdbus_user *user; + const struct cred *cred; + struct pid *pid; + struct path root_path; + atomic_t request_count; + atomic_t lost_count; + wait_queue_head_t wait; + struct kdbus_queue queue; + + struct kdbus_quota *quota; + unsigned int n_quota; + + /* protected by registry->rwlock */ + struct list_head names_list; + unsigned int name_count; + + bool privileged:1; + bool owner:1; +}; + +struct kdbus_conn *kdbus_conn_ref(struct kdbus_conn *conn); +struct kdbus_conn *kdbus_conn_unref(struct kdbus_conn *conn); +bool kdbus_conn_active(const struct kdbus_conn *conn); +int kdbus_conn_acquire(struct kdbus_conn *conn); +void kdbus_conn_release(struct kdbus_conn *conn); +int kdbus_conn_disconnect(struct kdbus_conn *conn, bool ensure_queue_empty); +bool kdbus_conn_has_name(struct kdbus_conn *conn, const char *name); +int kdbus_conn_quota_inc(struct kdbus_conn *c, struct kdbus_user *u, + size_t memory, size_t fds); +void kdbus_conn_quota_dec(struct kdbus_conn *c, struct kdbus_user *u, + size_t memory, size_t fds); +void kdbus_conn_lost_message(struct kdbus_conn *c); +int kdbus_conn_entry_insert(struct kdbus_conn *conn_src, + struct kdbus_conn *conn_dst, + struct kdbus_staging *staging, + struct kdbus_reply *reply, + const struct kdbus_name_entry *name); +void kdbus_conn_move_messages(struct kdbus_conn *conn_dst, + struct kdbus_conn *conn_src, + u64 name_id); + +/* policy */ +bool kdbus_conn_policy_own_name(struct kdbus_conn *conn, + const struct cred *conn_creds, + const char *name); +bool kdbus_conn_policy_talk(struct kdbus_conn *conn, + const struct cred *conn_creds, + struct kdbus_conn *to); +bool 
kdbus_conn_policy_see_name_unlocked(struct kdbus_conn *conn, + const struct cred *curr_creds, + const char *name); +bool kdbus_conn_policy_see_notification(struct kdbus_conn *conn, + const struct cred *curr_creds, + const struct kdbus_msg *msg); + +/* command dispatcher */ +struct kdbus_conn *kdbus_cmd_hello(struct kdbus_ep *ep, struct file *file, + void __user *argp); +int kdbus_cmd_byebye_unlocked(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_conn_info(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_update(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_send(struct kdbus_conn *conn, struct file *f, void __user *argp); +int kdbus_cmd_recv(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_free(struct kdbus_conn *conn, void __user *argp); + +/** + * kdbus_conn_is_ordinary() - Check if connection is ordinary + * @conn: The connection to check + * + * Return: Non-zero if the connection is an ordinary connection + */ +static inline int kdbus_conn_is_ordinary(const struct kdbus_conn *conn) +{ + return !(conn->flags & KDBUS_HELLO_SPECIAL_CONN); +} + +/** + * kdbus_conn_is_activator() - Check if connection is an activator + * @conn: The connection to check + * + * Return: Non-zero if the connection is an activator + */ +static inline int kdbus_conn_is_activator(const struct kdbus_conn *conn) +{ + return conn->flags & KDBUS_HELLO_ACTIVATOR; +} + +/** + * kdbus_conn_is_policy_holder() - Check if connection is a policy holder + * @conn: The connection to check + * + * Return: Non-zero if the connection is a policy holder + */ +static inline int kdbus_conn_is_policy_holder(const struct kdbus_conn *conn) +{ + return conn->flags & KDBUS_HELLO_POLICY_HOLDER; +} + +/** + * kdbus_conn_is_monitor() - Check if connection is a monitor + * @conn: The connection to check + * + * Return: Non-zero if the connection is a monitor + */ +static inline int kdbus_conn_is_monitor(const struct kdbus_conn *conn) +{ + return conn->flags & KDBUS_HELLO_MONITOR; +} + +/** + * kdbus_conn_lock2() - Lock two connections + * @a: connection A to lock or NULL + * @b: connection B to lock or NULL + * + * Lock two connections at once. As we need to have a stable locking order, we + * always lock the connection with lower memory address first. + */ +static inline void kdbus_conn_lock2(struct kdbus_conn *a, struct kdbus_conn *b) +{ + if (a < b) { + if (a) + mutex_lock(&a->lock); + if (b && b != a) + mutex_lock_nested(&b->lock, !!a); + } else { + if (b) + mutex_lock(&b->lock); + if (a && a != b) + mutex_lock_nested(&a->lock, !!b); + } +} + +/** + * kdbus_conn_unlock2() - Unlock two connections + * @a: connection A to unlock or NULL + * @b: connection B to unlock or NULL + * + * Unlock two connections at once. See kdbus_conn_lock2(). + */ +static inline void kdbus_conn_unlock2(struct kdbus_conn *a, + struct kdbus_conn *b) +{ + if (a) + mutex_unlock(&a->lock); + if (b && b != a) + mutex_unlock(&b->lock); +} + +/** + * kdbus_conn_assert_active() - lockdep assert on active lock + * @conn: connection that shall be active + * + * This verifies via lockdep that the caller holds an active reference to the + * given connection. 
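+ *
+ * The pattern it guards is, roughly:
+ *
+ *	ret = kdbus_conn_acquire(conn);
+ *	if (ret < 0)
+ *		return ret;
+ *	... work that relies on the connection staying alive ...
+ *	kdbus_conn_release(conn);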
+ */ +static inline void kdbus_conn_assert_active(struct kdbus_conn *conn) +{ + lockdep_assert_held(conn); +} + +#endif diff -Nur linux-4.2/ipc/kdbus/domain.c linux-4.2-pck/ipc/kdbus/domain.c --- linux-4.2/ipc/kdbus/domain.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/domain.c 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,296 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "domain.h" +#include "handle.h" +#include "item.h" +#include "limits.h" +#include "util.h" + +static void kdbus_domain_control_free(struct kdbus_node *node) +{ + kfree(node); +} + +static struct kdbus_node *kdbus_domain_control_new(struct kdbus_domain *domain, + unsigned int access) +{ + struct kdbus_node *node; + int ret; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return ERR_PTR(-ENOMEM); + + kdbus_node_init(node, KDBUS_NODE_CONTROL); + + node->free_cb = kdbus_domain_control_free; + node->mode = domain->node.mode; + node->mode = S_IRUSR | S_IWUSR; + if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD)) + node->mode |= S_IRGRP | S_IWGRP; + if (access & KDBUS_MAKE_ACCESS_WORLD) + node->mode |= S_IROTH | S_IWOTH; + + ret = kdbus_node_link(node, &domain->node, "control"); + if (ret < 0) + goto exit_free; + + return node; + +exit_free: + kdbus_node_drain(node); + kdbus_node_unref(node); + return ERR_PTR(ret); +} + +static void kdbus_domain_free(struct kdbus_node *node) +{ + struct kdbus_domain *domain = + container_of(node, struct kdbus_domain, node); + + put_user_ns(domain->user_namespace); + ida_destroy(&domain->user_ida); + idr_destroy(&domain->user_idr); + kfree(domain); +} + +/** + * kdbus_domain_new() - create a new domain + * @access: The access mode for this node (KDBUS_MAKE_ACCESS_*) + * + * Return: a new kdbus_domain on success, ERR_PTR on failure + */ +struct kdbus_domain *kdbus_domain_new(unsigned int access) +{ + struct kdbus_domain *d; + int ret; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return ERR_PTR(-ENOMEM); + + kdbus_node_init(&d->node, KDBUS_NODE_DOMAIN); + + d->node.free_cb = kdbus_domain_free; + d->node.mode = S_IRUSR | S_IXUSR; + if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD)) + d->node.mode |= S_IRGRP | S_IXGRP; + if (access & KDBUS_MAKE_ACCESS_WORLD) + d->node.mode |= S_IROTH | S_IXOTH; + + mutex_init(&d->lock); + idr_init(&d->user_idr); + ida_init(&d->user_ida); + + /* Pin user namespace so we can guarantee domain-unique bus * names. 
*/ + d->user_namespace = get_user_ns(current_user_ns()); + + ret = kdbus_node_link(&d->node, NULL, NULL); + if (ret < 0) + goto exit_unref; + + return d; + +exit_unref: + kdbus_node_drain(&d->node); + kdbus_node_unref(&d->node); + return ERR_PTR(ret); +} + +/** + * kdbus_domain_ref() - take a domain reference + * @domain: Domain + * + * Return: the domain itself + */ +struct kdbus_domain *kdbus_domain_ref(struct kdbus_domain *domain) +{ + if (domain) + kdbus_node_ref(&domain->node); + return domain; +} + +/** + * kdbus_domain_unref() - drop a domain reference + * @domain: Domain + * + * When the last reference is dropped, the domain internal structure + * is freed. + * + * Return: NULL + */ +struct kdbus_domain *kdbus_domain_unref(struct kdbus_domain *domain) +{ + if (domain) + kdbus_node_unref(&domain->node); + return NULL; +} + +/** + * kdbus_domain_populate() - populate static domain nodes + * @domain: domain to populate + * @access: KDBUS_MAKE_ACCESS_* access restrictions for new nodes + * + * Allocate and activate static sub-nodes of the given domain. This will fail if + * you call it on a non-active node or if the domain was already populated. + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_domain_populate(struct kdbus_domain *domain, unsigned int access) +{ + struct kdbus_node *control; + + /* + * Create a control-node for this domain. We drop our own reference + * immediately, effectively causing the node to be deactivated and + * released when the parent domain is. + */ + control = kdbus_domain_control_new(domain, access); + if (IS_ERR(control)) + return PTR_ERR(control); + + kdbus_node_activate(control); + kdbus_node_unref(control); + return 0; +} + +/** + * kdbus_user_lookup() - lookup a kdbus_user object + * @domain: domain of the user + * @uid: uid of the user; INVALID_UID for an anon user + * + * Lookup the kdbus user accounting object for the given domain. If INVALID_UID + * is passed, a new anonymous user is created which is private to the caller. + * + * Return: The user object is returned, ERR_PTR on failure. + */ +struct kdbus_user *kdbus_user_lookup(struct kdbus_domain *domain, kuid_t uid) +{ + struct kdbus_user *u = NULL, *old = NULL; + int ret; + + mutex_lock(&domain->lock); + + if (uid_valid(uid)) { + old = idr_find(&domain->user_idr, __kuid_val(uid)); + /* + * If the object is about to be destroyed, ignore it and + * replace the slot in the IDR later on. + */ + if (old && kref_get_unless_zero(&old->kref)) { + mutex_unlock(&domain->lock); + return old; + } + } + + u = kzalloc(sizeof(*u), GFP_KERNEL); + if (!u) { + ret = -ENOMEM; + goto exit; + } + + kref_init(&u->kref); + u->domain = kdbus_domain_ref(domain); + u->uid = uid; + atomic_set(&u->buses, 0); + atomic_set(&u->connections, 0); + + if (uid_valid(uid)) { + if (old) { + idr_replace(&domain->user_idr, u, __kuid_val(uid)); + old->uid = INVALID_UID; /* mark old as removed */ + } else { + ret = idr_alloc(&domain->user_idr, u, __kuid_val(uid), + __kuid_val(uid) + 1, GFP_KERNEL); + if (ret < 0) + goto exit; + } + } + + /* + * Allocate the smallest possible index for this user; used + * in arrays for accounting user quota in receiver queues. 
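+	 * (kdbus_conn_quota_inc() grows each connection's quota array up to
+	 * this id, so keeping the ids dense keeps those arrays small.)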
+ */ + ret = ida_simple_get(&domain->user_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto exit; + + u->id = ret; + mutex_unlock(&domain->lock); + return u; + +exit: + if (u) { + if (uid_valid(u->uid)) + idr_remove(&domain->user_idr, __kuid_val(u->uid)); + kdbus_domain_unref(u->domain); + kfree(u); + } + mutex_unlock(&domain->lock); + return ERR_PTR(ret); +} + +static void __kdbus_user_free(struct kref *kref) +{ + struct kdbus_user *user = container_of(kref, struct kdbus_user, kref); + + WARN_ON(atomic_read(&user->buses) > 0); + WARN_ON(atomic_read(&user->connections) > 0); + + mutex_lock(&user->domain->lock); + ida_simple_remove(&user->domain->user_ida, user->id); + if (uid_valid(user->uid)) + idr_remove(&user->domain->user_idr, __kuid_val(user->uid)); + mutex_unlock(&user->domain->lock); + + kdbus_domain_unref(user->domain); + kfree(user); +} + +/** + * kdbus_user_ref() - take a user reference + * @u: User + * + * Return: @u is returned + */ +struct kdbus_user *kdbus_user_ref(struct kdbus_user *u) +{ + if (u) + kref_get(&u->kref); + return u; +} + +/** + * kdbus_user_unref() - drop a user reference + * @u: User + * + * Return: NULL + */ +struct kdbus_user *kdbus_user_unref(struct kdbus_user *u) +{ + if (u) + kref_put(&u->kref, __kdbus_user_free); + return NULL; +} diff -Nur linux-4.2/ipc/kdbus/domain.h linux-4.2-pck/ipc/kdbus/domain.h --- linux-4.2/ipc/kdbus/domain.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/domain.h 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef __KDBUS_DOMAIN_H +#define __KDBUS_DOMAIN_H + +#include +#include +#include +#include + +#include "node.h" + +/** + * struct kdbus_domain - domain for buses + * @node: Underlying API node + * @lock: Domain data lock + * @last_id: Last used object id + * @user_idr: Set of all users indexed by UID + * @user_ida: Set of all users to compute small indices + * @user_namespace: User namespace, pinned at creation time + * @dentry: Root dentry of VFS mount (don't use outside of kdbusfs) + */ +struct kdbus_domain { + struct kdbus_node node; + struct mutex lock; + atomic64_t last_id; + struct idr user_idr; + struct ida user_ida; + struct user_namespace *user_namespace; + struct dentry *dentry; +}; + +/** + * struct kdbus_user - resource accounting for users + * @kref: Reference counter + * @domain: Domain of the user + * @id: Index of this user + * @uid: UID of the user + * @buses: Number of buses the user has created + * @connections: Number of connections the user has created + */ +struct kdbus_user { + struct kref kref; + struct kdbus_domain *domain; + unsigned int id; + kuid_t uid; + atomic_t buses; + atomic_t connections; +}; + +#define kdbus_domain_from_node(_node) \ + container_of((_node), struct kdbus_domain, node) + +struct kdbus_domain *kdbus_domain_new(unsigned int access); +struct kdbus_domain *kdbus_domain_ref(struct kdbus_domain *domain); +struct kdbus_domain *kdbus_domain_unref(struct kdbus_domain *domain); +int kdbus_domain_populate(struct kdbus_domain *domain, unsigned int access); + +#define KDBUS_USER_KERNEL_ID 0 /* ID 0 is reserved for kernel accounting */ + +struct kdbus_user *kdbus_user_lookup(struct kdbus_domain *domain, kuid_t uid); +struct kdbus_user *kdbus_user_ref(struct kdbus_user *u); +struct kdbus_user *kdbus_user_unref(struct kdbus_user *u); + +#endif diff -Nur linux-4.2/ipc/kdbus/endpoint.c linux-4.2-pck/ipc/kdbus/endpoint.c --- linux-4.2/ipc/kdbus/endpoint.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/endpoint.c 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,303 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "domain.h" +#include "endpoint.h" +#include "handle.h" +#include "item.h" +#include "message.h" +#include "policy.h" + +static void kdbus_ep_free(struct kdbus_node *node) +{ + struct kdbus_ep *ep = container_of(node, struct kdbus_ep, node); + + WARN_ON(!list_empty(&ep->conn_list)); + + kdbus_policy_db_clear(&ep->policy_db); + kdbus_bus_unref(ep->bus); + kdbus_user_unref(ep->user); + kfree(ep); +} + +static void kdbus_ep_release(struct kdbus_node *node, bool was_active) +{ + struct kdbus_ep *ep = container_of(node, struct kdbus_ep, node); + + /* disconnect all connections to this endpoint */ + for (;;) { + struct kdbus_conn *conn; + + mutex_lock(&ep->lock); + conn = list_first_entry_or_null(&ep->conn_list, + struct kdbus_conn, + ep_entry); + if (!conn) { + mutex_unlock(&ep->lock); + break; + } + + /* take reference, release lock, disconnect without lock */ + kdbus_conn_ref(conn); + mutex_unlock(&ep->lock); + + kdbus_conn_disconnect(conn, false); + kdbus_conn_unref(conn); + } +} + +/** + * kdbus_ep_new() - create a new endpoint + * @bus: The bus this endpoint will be created for + * @name: The name of the endpoint + * @access: The access flags for this node (KDBUS_MAKE_ACCESS_*) + * @uid: The uid of the node + * @gid: The gid of the node + * @is_custom: Whether this is a custom endpoint + * + * This function will create a new endpoint with the given + * name and properties for a given bus. + * + * Return: a new kdbus_ep on success, ERR_PTR on failure. + */ +struct kdbus_ep *kdbus_ep_new(struct kdbus_bus *bus, const char *name, + unsigned int access, kuid_t uid, kgid_t gid, + bool is_custom) +{ + struct kdbus_ep *e; + int ret; + + /* + * Validate only custom endpoints names, default endpoints + * with a "bus" name are created when the bus is created + */ + if (is_custom) { + ret = kdbus_verify_uid_prefix(name, bus->domain->user_namespace, + uid); + if (ret < 0) + return ERR_PTR(ret); + } + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + + kdbus_node_init(&e->node, KDBUS_NODE_ENDPOINT); + + e->node.free_cb = kdbus_ep_free; + e->node.release_cb = kdbus_ep_release; + e->node.uid = uid; + e->node.gid = gid; + e->node.mode = S_IRUSR | S_IWUSR; + if (access & (KDBUS_MAKE_ACCESS_GROUP | KDBUS_MAKE_ACCESS_WORLD)) + e->node.mode |= S_IRGRP | S_IWGRP; + if (access & KDBUS_MAKE_ACCESS_WORLD) + e->node.mode |= S_IROTH | S_IWOTH; + + mutex_init(&e->lock); + INIT_LIST_HEAD(&e->conn_list); + kdbus_policy_db_init(&e->policy_db); + e->bus = kdbus_bus_ref(bus); + + ret = kdbus_node_link(&e->node, &bus->node, name); + if (ret < 0) + goto exit_unref; + + /* + * Transactions on custom endpoints are never accounted on the global + * user limits. Instead, for each custom endpoint, we create a custom, + * unique user, which all transactions are accounted on. Regardless of + * the user using that endpoint, it is always accounted on the same + * user-object. This budget is not shared with ordinary users on + * non-custom endpoints. 
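 *
 * For example, if two clients running as UID 1000 and UID 1001 both send
 * through the same custom endpoint, both transactions are charged to this
 * endpoint's anonymous user rather than to the per-UID users of the
 * domain.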
+ */ + if (is_custom) { + e->user = kdbus_user_lookup(bus->domain, INVALID_UID); + if (IS_ERR(e->user)) { + ret = PTR_ERR(e->user); + e->user = NULL; + goto exit_unref; + } + } + + return e; + +exit_unref: + kdbus_node_drain(&e->node); + kdbus_node_unref(&e->node); + return ERR_PTR(ret); +} + +/** + * kdbus_ep_ref() - increase the reference counter of a kdbus_ep + * @ep: The endpoint to reference + * + * Every user of an endpoint, except for its creator, must add a reference to + * the kdbus_ep instance using this function. + * + * Return: the ep itself + */ +struct kdbus_ep *kdbus_ep_ref(struct kdbus_ep *ep) +{ + if (ep) + kdbus_node_ref(&ep->node); + return ep; +} + +/** + * kdbus_ep_unref() - decrease the reference counter of a kdbus_ep + * @ep: The ep to unref + * + * Release a reference. If the reference count drops to 0, the ep will be + * freed. + * + * Return: NULL + */ +struct kdbus_ep *kdbus_ep_unref(struct kdbus_ep *ep) +{ + if (ep) + kdbus_node_unref(&ep->node); + return NULL; +} + +/** + * kdbus_ep_is_privileged() - check whether a file is privileged + * @ep: endpoint to operate on + * @file: file to test + * + * Return: True if @file is privileged in the domain of @ep. + */ +bool kdbus_ep_is_privileged(struct kdbus_ep *ep, struct file *file) +{ + return !ep->user && + file_ns_capable(file, ep->bus->domain->user_namespace, + CAP_IPC_OWNER); +} + +/** + * kdbus_ep_is_owner() - check whether a file should be treated as bus owner + * @ep: endpoint to operate on + * @file: file to test + * + * Return: True if @file should be treated as bus owner on @ep + */ +bool kdbus_ep_is_owner(struct kdbus_ep *ep, struct file *file) +{ + return !ep->user && + (uid_eq(file->f_cred->euid, ep->bus->node.uid) || + kdbus_ep_is_privileged(ep, file)); +} + +/** + * kdbus_cmd_ep_make() - handle KDBUS_CMD_ENDPOINT_MAKE + * @bus: bus to operate on + * @argp: command payload + * + * Return: NULL or newly created endpoint on success, ERR_PTR on failure. + */ +struct kdbus_ep *kdbus_cmd_ep_make(struct kdbus_bus *bus, void __user *argp) +{ + const char *item_make_name; + struct kdbus_ep *ep = NULL; + struct kdbus_cmd *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_MAKE_NAME, .mandatory = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_MAKE_ACCESS_GROUP | + KDBUS_MAKE_ACCESS_WORLD, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + + item_make_name = argv[1].item->str; + + ep = kdbus_ep_new(bus, item_make_name, cmd->flags, + current_euid(), current_egid(), true); + if (IS_ERR(ep)) { + ret = PTR_ERR(ep); + ep = NULL; + goto exit; + } + + if (!kdbus_node_activate(&ep->node)) { + ret = -ESHUTDOWN; + goto exit; + } + +exit: + ret = kdbus_args_clear(&args, ret); + if (ret < 0) { + if (ep) { + kdbus_node_drain(&ep->node); + kdbus_ep_unref(ep); + } + return ERR_PTR(ret); + } + return ep; +} + +/** + * kdbus_cmd_ep_update() - handle KDBUS_CMD_ENDPOINT_UPDATE + * @ep: endpoint to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. 
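 *
 * A rough sketch of the item stream user-space passes here (the name and
 * the access entries are made-up values; see kdbus.policy(7) in
 * Documentation/kdbus/ for the exact semantics):
 *
 *        KDBUS_ITEM_NAME            "com.example.Service"
 *        KDBUS_ITEM_POLICY_ACCESS   struct kdbus_policy_access { ... }
 *        KDBUS_ITEM_POLICY_ACCESS   struct kdbus_policy_access { ... }
 *
 * Both item types may occur multiple times; the parsed set is handed to
 * kdbus_policy_set() below to update this endpoint's policy database.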
+ */ +int kdbus_cmd_ep_update(struct kdbus_ep *ep, void __user *argp) +{ + struct kdbus_cmd *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_NAME, .multiple = true }, + { .type = KDBUS_ITEM_POLICY_ACCESS, .multiple = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + ret = kdbus_policy_set(&ep->policy_db, args.items, args.items_size, + 0, true, ep); + return kdbus_args_clear(&args, ret); +} diff -Nur linux-4.2/ipc/kdbus/endpoint.h linux-4.2-pck/ipc/kdbus/endpoint.h --- linux-4.2/ipc/kdbus/endpoint.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/endpoint.h 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_ENDPOINT_H +#define __KDBUS_ENDPOINT_H + +#include +#include +#include +#include "node.h" +#include "policy.h" + +struct kdbus_bus; +struct kdbus_user; + +/** + * struct kdbus_ep - endpoint to access a bus + * @node: The kdbus node + * @lock: Endpoint data lock + * @bus: Bus behind this endpoint + * @user: Custom endpoints account against an anonymous user + * @policy_db: Uploaded policy + * @conn_list: Connections of this endpoint + * + * An endpoint offers access to a bus; the default endpoint node name is "bus". + * Additional custom endpoints to the same bus can be created and they can + * carry their own policies/filters.
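 *
 * A minimal user-space sketch of creating such a custom endpoint via
 * KDBUS_CMD_ENDPOINT_MAKE (illustrative only; it assumes the uapi header
 * uapi/linux/kdbus.h added by this patch, a caller EUID of 1000 since
 * custom endpoint names must carry the creator's EUID prefix, and ep_fd
 * being an open file descriptor of the bus's default endpoint; error
 * handling is omitted):
 *
 *        #include <stddef.h>
 *        #include <stdlib.h>
 *        #include <string.h>
 *        #include <sys/ioctl.h>
 *        #include <linux/kdbus.h>
 *
 *        const char name[] = "1000-custom";
 *        size_t item_size = offsetof(struct kdbus_item, str) + sizeof(name);
 *        struct kdbus_cmd *cmd = calloc(1, sizeof(*cmd) + item_size);
 *
 *        cmd->size = sizeof(*cmd) + item_size;
 *        cmd->flags = KDBUS_MAKE_ACCESS_WORLD;
 *        cmd->items[0].size = item_size;
 *        cmd->items[0].type = KDBUS_ITEM_MAKE_NAME;
 *        memcpy(cmd->items[0].str, name, sizeof(name));
 *
 *        ioctl(ep_fd, KDBUS_CMD_ENDPOINT_MAKE, cmd);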
+ */ +struct kdbus_ep { + struct kdbus_node node; + struct mutex lock; + + /* static */ + struct kdbus_bus *bus; + struct kdbus_user *user; + + /* protected by own locks */ + struct kdbus_policy_db policy_db; + + /* protected by ep->lock */ + struct list_head conn_list; +}; + +#define kdbus_ep_from_node(_node) \ + container_of((_node), struct kdbus_ep, node) + +struct kdbus_ep *kdbus_ep_new(struct kdbus_bus *bus, const char *name, + unsigned int access, kuid_t uid, kgid_t gid, + bool policy); +struct kdbus_ep *kdbus_ep_ref(struct kdbus_ep *ep); +struct kdbus_ep *kdbus_ep_unref(struct kdbus_ep *ep); + +bool kdbus_ep_is_privileged(struct kdbus_ep *ep, struct file *file); +bool kdbus_ep_is_owner(struct kdbus_ep *ep, struct file *file); + +struct kdbus_ep *kdbus_cmd_ep_make(struct kdbus_bus *bus, void __user *argp); +int kdbus_cmd_ep_update(struct kdbus_ep *ep, void __user *argp); + +#endif diff -Nur linux-4.2/ipc/kdbus/fs.c linux-4.2-pck/ipc/kdbus/fs.c --- linux-4.2/ipc/kdbus/fs.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/fs.c 2015-09-08 00:48:22.231697056 -0300 @@ -0,0 +1,508 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "domain.h" +#include "endpoint.h" +#include "fs.h" +#include "handle.h" +#include "node.h" + +#define kdbus_node_from_dentry(_dentry) \ + ((struct kdbus_node *)(_dentry)->d_fsdata) + +static struct inode *fs_inode_get(struct super_block *sb, + struct kdbus_node *node); + +/* + * Directory Management + */ + +static inline unsigned char kdbus_dt_type(struct kdbus_node *node) +{ + switch (node->type) { + case KDBUS_NODE_DOMAIN: + case KDBUS_NODE_BUS: + return DT_DIR; + case KDBUS_NODE_CONTROL: + case KDBUS_NODE_ENDPOINT: + return DT_REG; + } + + return DT_UNKNOWN; +} + +static int fs_dir_fop_iterate(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct kdbus_node *parent = kdbus_node_from_dentry(dentry); + struct kdbus_node *old, *next = file->private_data; + + /* + * kdbusfs directory iterator (modelled after sysfs/kernfs) + * When iterating kdbusfs directories, we iterate all children of the + * parent kdbus_node object. We use ctx->pos to store the hash of the + * child and file->private_data to store a reference to the next node + * object. If ctx->pos is not modified via llseek while you iterate a + * directory, then we use the file->private_data node pointer to + * directly access the next node in the tree. + * However, if you directly seek on the directory, we have to find the + * closest node to that position and cannot use our node pointer. This + * means iterating the rb-tree to find the closest match and start over + * from there. + * Note that hash values are not necessarily unique. Therefore, llseek + * is not guaranteed to seek to the same node that you got when you + * retrieved the position. Seeking to 0, 1, 2 and >=INT_MAX is safe, + * though. 
We could use the inode-number as position, but this would + * require another rb-tree for fast access. Kernfs and others already + * ignore those conflicts, so we should be fine, too. + */ + + if (!dir_emit_dots(file, ctx)) + return 0; + + /* acquire @next; if deactivated, or seek detected, find next node */ + old = next; + if (next && ctx->pos == next->hash) { + if (kdbus_node_acquire(next)) + kdbus_node_ref(next); + else + next = kdbus_node_next_child(parent, next); + } else { + next = kdbus_node_find_closest(parent, ctx->pos); + } + kdbus_node_unref(old); + + while (next) { + /* emit @next */ + file->private_data = next; + ctx->pos = next->hash; + + kdbus_node_release(next); + + if (!dir_emit(ctx, next->name, strlen(next->name), next->id, + kdbus_dt_type(next))) + return 0; + + /* find next node after @next */ + old = next; + next = kdbus_node_next_child(parent, next); + kdbus_node_unref(old); + } + + file->private_data = NULL; + ctx->pos = INT_MAX; + + return 0; +} + +static loff_t fs_dir_fop_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + /* protect f_off against fop_iterate */ + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +static int fs_dir_fop_release(struct inode *inode, struct file *file) +{ + kdbus_node_unref(file->private_data); + return 0; +} + +static const struct file_operations fs_dir_fops = { + .read = generic_read_dir, + .iterate = fs_dir_fop_iterate, + .llseek = fs_dir_fop_llseek, + .release = fs_dir_fop_release, +}; + +static struct dentry *fs_dir_iop_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct dentry *dnew = NULL; + struct kdbus_node *parent; + struct kdbus_node *node; + struct inode *inode; + + parent = kdbus_node_from_dentry(dentry->d_parent); + if (!kdbus_node_acquire(parent)) + return NULL; + + /* returns reference to _acquired_ child node */ + node = kdbus_node_find_child(parent, dentry->d_name.name); + if (node) { + dentry->d_fsdata = node; + inode = fs_inode_get(dir->i_sb, node); + if (IS_ERR(inode)) + dnew = ERR_CAST(inode); + else + dnew = d_splice_alias(inode, dentry); + + kdbus_node_release(node); + } + + kdbus_node_release(parent); + return dnew; +} + +static const struct inode_operations fs_dir_iops = { + .permission = generic_permission, + .lookup = fs_dir_iop_lookup, +}; + +/* + * Inode Management + */ + +static const struct inode_operations fs_inode_iops = { + .permission = generic_permission, +}; + +static struct inode *fs_inode_get(struct super_block *sb, + struct kdbus_node *node) +{ + struct inode *inode; + + inode = iget_locked(sb, node->id); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + inode->i_private = kdbus_node_ref(node); + inode->i_mapping->a_ops = &empty_aops; + inode->i_mode = node->mode & S_IALLUGO; + inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode->i_uid = node->uid; + inode->i_gid = node->gid; + + switch (node->type) { + case KDBUS_NODE_DOMAIN: + case KDBUS_NODE_BUS: + inode->i_mode |= S_IFDIR; + inode->i_op = &fs_dir_iops; + inode->i_fop = &fs_dir_fops; + set_nlink(inode, 2); + break; + case KDBUS_NODE_CONTROL: + case KDBUS_NODE_ENDPOINT: + inode->i_mode |= S_IFREG; + inode->i_op = &fs_inode_iops; + inode->i_fop = &kdbus_handle_ops; + break; + } + + unlock_new_inode(inode); + + return inode; +} + +/* + * Superblock Management + */ + +static int 
fs_super_dop_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct kdbus_node *node; + + /* Force lookup on negatives */ + if (!dentry->d_inode) + return 0; + + node = kdbus_node_from_dentry(dentry); + + /* see whether the node has been removed */ + if (!kdbus_node_is_active(node)) + return 0; + + return 1; +} + +static void fs_super_dop_release(struct dentry *dentry) +{ + kdbus_node_unref(dentry->d_fsdata); +} + +static const struct dentry_operations fs_super_dops = { + .d_revalidate = fs_super_dop_revalidate, + .d_release = fs_super_dop_release, +}; + +static void fs_super_sop_evict_inode(struct inode *inode) +{ + struct kdbus_node *node = kdbus_node_from_inode(inode); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + kdbus_node_unref(node); +} + +static const struct super_operations fs_super_sops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = fs_super_sop_evict_inode, +}; + +static int fs_super_fill(struct super_block *sb) +{ + struct kdbus_domain *domain = sb->s_fs_info; + struct inode *inode; + int ret; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = KDBUS_SUPER_MAGIC; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &fs_super_sops; + sb->s_time_gran = 1; + + inode = fs_inode_get(sb, &domain->node); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) { + /* d_make_root iput()s the inode on failure */ + return -ENOMEM; + } + + /* sb holds domain reference */ + sb->s_root->d_fsdata = &domain->node; + sb->s_d_op = &fs_super_dops; + + /* sb holds root reference */ + domain->dentry = sb->s_root; + + if (!kdbus_node_activate(&domain->node)) + return -ESHUTDOWN; + + ret = kdbus_domain_populate(domain, KDBUS_MAKE_ACCESS_WORLD); + if (ret < 0) + return ret; + + sb->s_flags |= MS_ACTIVE; + return 0; +} + +static void fs_super_kill(struct super_block *sb) +{ + struct kdbus_domain *domain = sb->s_fs_info; + + if (domain) { + kdbus_node_drain(&domain->node); + domain->dentry = NULL; + } + + kill_anon_super(sb); + kdbus_domain_unref(domain); +} + +static int fs_super_set(struct super_block *sb, void *data) +{ + int ret; + + ret = set_anon_super(sb, data); + if (!ret) + sb->s_fs_info = data; + + return ret; +} + +static struct dentry *fs_super_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + struct kdbus_domain *domain; + struct super_block *sb; + int ret; + + domain = kdbus_domain_new(KDBUS_MAKE_ACCESS_WORLD); + if (IS_ERR(domain)) + return ERR_CAST(domain); + + sb = sget(fs_type, NULL, fs_super_set, flags, domain); + if (IS_ERR(sb)) { + kdbus_node_drain(&domain->node); + kdbus_domain_unref(domain); + return ERR_CAST(sb); + } + + WARN_ON(sb->s_fs_info != domain); + WARN_ON(sb->s_root); + + ret = fs_super_fill(sb); + if (ret < 0) { + /* calls into ->kill_sb() when done */ + deactivate_locked_super(sb); + return ERR_PTR(ret); + } + + return dget(sb->s_root); +} + +static struct file_system_type fs_type = { + .name = KBUILD_MODNAME "fs", + .owner = THIS_MODULE, + .mount = fs_super_mount, + .kill_sb = fs_super_kill, + .fs_flags = FS_USERNS_MOUNT, +}; + +/** + * kdbus_fs_init() - register kdbus filesystem + * + * This registers a filesystem with the VFS layer. The filesystem is called + * `KBUILD_MODNAME "fs"', which usually resolves to `kdbusfs'. The nameing + * scheme allows to set KBUILD_MODNAME to "kdbus2" and you will get an + * independent filesystem for developers. 
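 *
 * A user-space sketch of mounting it (illustrative only; /sys/fs/kdbus is
 * the mount point registered by kdbus_init() in main.c):
 *
 *        #include <stdio.h>
 *        #include <sys/mount.h>
 *
 *        if (mount("kdbusfs", "/sys/fs/kdbus", "kdbusfs", 0, NULL) < 0)
 *                perror("mount kdbusfs");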
+ * + * Each mount of the kdbusfs filesystem has an kdbus_domain attached. + * Operations on this mount will only affect the attached domain. On each mount + * a new domain is automatically created and used for this mount exclusively. + * If you want to share a domain across multiple mounts, you need to bind-mount + * it. + * + * Mounts of kdbusfs (with a different domain each) are unrelated to each other + * and will never have any effect on any domain but their own. + * + * Return: 0 on success, negative error otherwise. + */ +int kdbus_fs_init(void) +{ + return register_filesystem(&fs_type); +} + +/** + * kdbus_fs_exit() - unregister kdbus filesystem + * + * This does the reverse to kdbus_fs_init(). It unregisters the kdbusfs + * filesystem from VFS and cleans up any allocated resources. + */ +void kdbus_fs_exit(void) +{ + unregister_filesystem(&fs_type); +} + +/* acquire domain of @node, making sure all ancestors are active */ +static struct kdbus_domain *fs_acquire_domain(struct kdbus_node *node) +{ + struct kdbus_domain *domain; + struct kdbus_node *iter; + + /* caller must guarantee that @node is linked */ + for (iter = node; iter->parent; iter = iter->parent) + if (!kdbus_node_is_active(iter->parent)) + return NULL; + + /* root nodes are always domains */ + if (WARN_ON(iter->type != KDBUS_NODE_DOMAIN)) + return NULL; + + domain = kdbus_domain_from_node(iter); + if (!kdbus_node_acquire(&domain->node)) + return NULL; + + return domain; +} + +/** + * kdbus_fs_flush() - flush dcache entries of a node + * @node: Node to flush entries of + * + * This flushes all VFS filesystem cache entries for a node and all its + * children. This should be called whenever a node is destroyed during + * runtime. It will flush the cache entries so the linked objects can be + * deallocated. + * + * This is a no-op if you call it on active nodes (they really should stay in + * cache) or on nodes with deactivated parents (flushing the parent is enough). + * Furthermore, there is no need to call it on nodes whose lifetime is bound to + * their parents'. In those cases, the parent-flush will always also flush the + * children. + */ +void kdbus_fs_flush(struct kdbus_node *node) +{ + struct dentry *dentry, *parent_dentry = NULL; + struct kdbus_domain *domain; + struct qstr name; + + /* active nodes should remain in cache */ + if (!kdbus_node_is_deactivated(node)) + return; + + /* nodes that were never linked were never instantiated */ + if (!node->parent) + return; + + /* acquire domain and verify all ancestors are active */ + domain = fs_acquire_domain(node); + if (!domain) + return; + + switch (node->type) { + case KDBUS_NODE_ENDPOINT: + if (WARN_ON(!node->parent || !node->parent->name)) + goto exit; + + name.name = node->parent->name; + name.len = strlen(node->parent->name); + parent_dentry = d_hash_and_lookup(domain->dentry, &name); + if (IS_ERR_OR_NULL(parent_dentry)) + goto exit; + + /* fallthrough */ + case KDBUS_NODE_BUS: + if (WARN_ON(!node->name)) + goto exit; + + name.name = node->name; + name.len = strlen(node->name); + dentry = d_hash_and_lookup(parent_dentry ? 
: domain->dentry, + &name); + if (!IS_ERR_OR_NULL(dentry)) { + d_invalidate(dentry); + dput(dentry); + } + + dput(parent_dentry); + break; + + default: + /* all other types are bound to their parent lifetime */ + break; + } + +exit: + kdbus_node_release(&domain->node); +} diff -Nur linux-4.2/ipc/kdbus/fs.h linux-4.2-pck/ipc/kdbus/fs.h --- linux-4.2/ipc/kdbus/fs.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/fs.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUSFS_H +#define __KDBUSFS_H + +#include + +struct kdbus_node; + +int kdbus_fs_init(void); +void kdbus_fs_exit(void); +void kdbus_fs_flush(struct kdbus_node *node); + +#define kdbus_node_from_inode(_inode) \ + ((struct kdbus_node *)(_inode)->i_private) + +#endif diff -Nur linux-4.2/ipc/kdbus/handle.c linux-4.2-pck/ipc/kdbus/handle.c --- linux-4.2/ipc/kdbus/handle.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/handle.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,691 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "fs.h" +#include "handle.h" +#include "item.h" +#include "match.h" +#include "message.h" +#include "names.h" +#include "domain.h" +#include "policy.h" + +static int kdbus_args_verify(struct kdbus_args *args) +{ + struct kdbus_item *item; + size_t i; + int ret; + + KDBUS_ITEMS_FOREACH(item, args->items, args->items_size) { + struct kdbus_arg *arg = NULL; + + if (!KDBUS_ITEM_VALID(item, args->items, args->items_size)) + return -EINVAL; + + for (i = 0; i < args->argc; ++i) + if (args->argv[i].type == item->type) + break; + if (i >= args->argc) + return -EINVAL; + + arg = &args->argv[i]; + + ret = kdbus_item_validate(item); + if (ret < 0) + return ret; + + if (arg->item && !arg->multiple) + return -EINVAL; + + arg->item = item; + } + + if (!KDBUS_ITEMS_END(item, args->items, args->items_size)) + return -EINVAL; + + return 0; +} + +static int kdbus_args_negotiate(struct kdbus_args *args) +{ + struct kdbus_item __user *user; + struct kdbus_item *negotiation; + size_t i, j, num; + + /* + * If KDBUS_FLAG_NEGOTIATE is set, we overwrite the flags field with + * the set of supported flags. Furthermore, if an KDBUS_ITEM_NEGOTIATE + * item is passed, we iterate its payload (array of u64, each set to an + * item type) and clear all unsupported item-types to 0. 
+ * The caller might do this recursively, if other flags or objects are + * embedded in the payload itself. + */ + + if (args->cmd->flags & KDBUS_FLAG_NEGOTIATE) { + if (put_user(args->allowed_flags & ~KDBUS_FLAG_NEGOTIATE, + &args->user->flags)) + return -EFAULT; + } + + if (args->argc < 1 || args->argv[0].type != KDBUS_ITEM_NEGOTIATE || + !args->argv[0].item) + return 0; + + negotiation = args->argv[0].item; + user = (struct kdbus_item __user *) + ((u8 __user *)args->user + + ((u8 *)negotiation - (u8 *)args->cmd)); + num = KDBUS_ITEM_PAYLOAD_SIZE(negotiation) / sizeof(u64); + + for (i = 0; i < num; ++i) { + for (j = 0; j < args->argc; ++j) + if (negotiation->data64[i] == args->argv[j].type) + break; + + if (j < args->argc) + continue; + + /* this item is not supported, clear it out */ + negotiation->data64[i] = 0; + if (put_user(negotiation->data64[i], &user->data64[i])) + return -EFAULT; + } + + return 0; +} + +/** + * __kdbus_args_parse() - parse payload of kdbus command + * @args: object to parse data into + * @is_cmd: whether this is a command or msg payload + * @argp: user-space location of command payload to parse + * @type_size: overall size of command payload to parse + * @items_offset: offset of items array in command payload + * @out: output variable to store pointer to copied payload + * + * This parses the ioctl payload at user-space location @argp into @args. @args + * must be pre-initialized by the caller to reflect the supported flags and + * items of this command. This parser will then copy the command payload into + * kernel-space, verify correctness and consistency and cache pointers to parsed + * items and other data in @args. + * + * If this function succeeded, you must call kdbus_args_clear() to release + * allocated resources before destroying @args. + * + * This can also be used to import kdbus_msg objects. In that case, @is_cmd must + * be set to 'false' and the 'return_flags' field will not be touched (as it + * doesn't exist on kdbus_msg). + * + * Return: On failure a negative error code is returned. Otherwise, 1 is + * returned if negotiation was requested, 0 if not. 
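 *
 * To illustrate the negotiation path from the user-space side (values are
 * made up): a caller probing KDBUS_CMD_ENDPOINT_UPDATE would set
 * KDBUS_FLAG_NEGOTIATE in cmd->flags and append a KDBUS_ITEM_NEGOTIATE
 * item whose payload is an array of u64 item types:
 *
 *        __u64 probe[] = { KDBUS_ITEM_NAME, KDBUS_ITEM_POLICY_ACCESS };
 *
 * The array is copied into the payload of that item in the command buffer.
 * After ioctl(ep_fd, KDBUS_CMD_ENDPOINT_UPDATE, cmd) returns, cmd->flags
 * holds the flag set this command accepts (with KDBUS_FLAG_NEGOTIATE
 * itself cleared), and every unsupported type in the item's payload has
 * been overwritten with 0.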
+ */ +int __kdbus_args_parse(struct kdbus_args *args, bool is_cmd, void __user *argp, + size_t type_size, size_t items_offset, void **out) +{ + u64 user_size; + int ret, i; + + ret = kdbus_copy_from_user(&user_size, argp, sizeof(user_size)); + if (ret < 0) + return ret; + + if (user_size < type_size) + return -EINVAL; + if (user_size > KDBUS_CMD_MAX_SIZE) + return -EMSGSIZE; + + if (user_size <= sizeof(args->cmd_buf)) { + if (copy_from_user(args->cmd_buf, argp, user_size)) + return -EFAULT; + args->cmd = (void*)args->cmd_buf; + } else { + args->cmd = memdup_user(argp, user_size); + if (IS_ERR(args->cmd)) + return PTR_ERR(args->cmd); + } + + if (args->cmd->size != user_size) { + ret = -EINVAL; + goto error; + } + + if (is_cmd) + args->cmd->return_flags = 0; + args->user = argp; + args->items = (void *)((u8 *)args->cmd + items_offset); + args->items_size = args->cmd->size - items_offset; + args->is_cmd = is_cmd; + + if (args->cmd->flags & ~args->allowed_flags) { + ret = -EINVAL; + goto error; + } + + ret = kdbus_args_verify(args); + if (ret < 0) + goto error; + + ret = kdbus_args_negotiate(args); + if (ret < 0) + goto error; + + /* mandatory items must be given (but not on negotiation) */ + if (!(args->cmd->flags & KDBUS_FLAG_NEGOTIATE)) { + for (i = 0; i < args->argc; ++i) + if (args->argv[i].mandatory && !args->argv[i].item) { + ret = -EINVAL; + goto error; + } + } + + *out = args->cmd; + return !!(args->cmd->flags & KDBUS_FLAG_NEGOTIATE); + +error: + return kdbus_args_clear(args, ret); +} + +/** + * kdbus_args_clear() - release allocated command resources + * @args: object to release resources of + * @ret: return value of this command + * + * This frees all allocated resources on @args and copies the command result + * flags into user-space. @ret is usually returned unchanged by this function, + * so it can be used in the final 'return' statement of the command handler. + * + * Return: -EFAULT if return values cannot be copied into user-space, otherwise + * @ret is returned unchanged. 
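 *
 * A typical command handler therefore looks roughly like this (compare
 * kdbus_cmd_ep_update() in endpoint.c; do_the_actual_work() is a made-up
 * placeholder for the command-specific body):
 *
 *        ret = kdbus_args_parse(&args, argp, &cmd);
 *        if (ret != 0)
 *                return ret;
 *
 *        ret = do_the_actual_work();
 *
 *        return kdbus_args_clear(&args, ret);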
+ */ +int kdbus_args_clear(struct kdbus_args *args, int ret) +{ + if (!args) + return ret; + + if (!IS_ERR_OR_NULL(args->cmd)) { + if (args->is_cmd && put_user(args->cmd->return_flags, + &args->user->return_flags)) + ret = -EFAULT; + if (args->cmd != (void*)args->cmd_buf) + kfree(args->cmd); + args->cmd = NULL; + } + + return ret; +} + +/** + * enum kdbus_handle_type - type an handle can be of + * @KDBUS_HANDLE_NONE: no type set, yet + * @KDBUS_HANDLE_BUS_OWNER: bus owner + * @KDBUS_HANDLE_EP_OWNER: endpoint owner + * @KDBUS_HANDLE_CONNECTED: endpoint connection after HELLO + */ +enum kdbus_handle_type { + KDBUS_HANDLE_NONE, + KDBUS_HANDLE_BUS_OWNER, + KDBUS_HANDLE_EP_OWNER, + KDBUS_HANDLE_CONNECTED, +}; + +/** + * struct kdbus_handle - handle to the kdbus system + * @lock: handle lock + * @type: type of this handle (KDBUS_HANDLE_*) + * @bus_owner: bus this handle owns + * @ep_owner: endpoint this handle owns + * @conn: connection this handle owns + */ +struct kdbus_handle { + struct mutex lock; + + enum kdbus_handle_type type; + union { + struct kdbus_bus *bus_owner; + struct kdbus_ep *ep_owner; + struct kdbus_conn *conn; + }; +}; + +static int kdbus_handle_open(struct inode *inode, struct file *file) +{ + struct kdbus_handle *handle; + struct kdbus_node *node; + int ret; + + node = kdbus_node_from_inode(inode); + if (!kdbus_node_acquire(node)) + return -ESHUTDOWN; + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) { + ret = -ENOMEM; + goto exit; + } + + mutex_init(&handle->lock); + handle->type = KDBUS_HANDLE_NONE; + + file->private_data = handle; + ret = 0; + +exit: + kdbus_node_release(node); + return ret; +} + +static int kdbus_handle_release(struct inode *inode, struct file *file) +{ + struct kdbus_handle *handle = file->private_data; + + switch (handle->type) { + case KDBUS_HANDLE_BUS_OWNER: + if (handle->bus_owner) { + kdbus_node_drain(&handle->bus_owner->node); + kdbus_bus_unref(handle->bus_owner); + } + break; + case KDBUS_HANDLE_EP_OWNER: + if (handle->ep_owner) { + kdbus_node_drain(&handle->ep_owner->node); + kdbus_ep_unref(handle->ep_owner); + } + break; + case KDBUS_HANDLE_CONNECTED: + kdbus_conn_disconnect(handle->conn, false); + kdbus_conn_unref(handle->conn); + break; + case KDBUS_HANDLE_NONE: + /* nothing to clean up */ + break; + } + + kfree(handle); + + return 0; +} + +static long kdbus_handle_ioctl_control(struct file *file, unsigned int cmd, + void __user *argp) +{ + struct kdbus_handle *handle = file->private_data; + struct kdbus_node *node = file_inode(file)->i_private; + struct kdbus_domain *domain; + int ret = 0; + + if (!kdbus_node_acquire(node)) + return -ESHUTDOWN; + + /* + * The parent of control-nodes is always a domain, make sure to pin it + * so the parent is actually valid. 
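 *
 * For instance, a handle opened on a control node starts out as
 * KDBUS_HANDLE_NONE; a successful KDBUS_CMD_BUS_MAKE below returns
 * KDBUS_HANDLE_BUS_OWNER, which kdbus_handle_ioctl() then records as the
 * handle type, and the bus is torn down again in kdbus_handle_release()
 * when the file is closed.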
+ */ + domain = kdbus_domain_from_node(node->parent); + if (!kdbus_node_acquire(&domain->node)) { + kdbus_node_release(node); + return -ESHUTDOWN; + } + + switch (cmd) { + case KDBUS_CMD_BUS_MAKE: { + struct kdbus_bus *bus; + + bus = kdbus_cmd_bus_make(domain, argp); + if (IS_ERR_OR_NULL(bus)) { + ret = PTR_ERR_OR_ZERO(bus); + break; + } + + handle->bus_owner = bus; + ret = KDBUS_HANDLE_BUS_OWNER; + break; + } + + default: + ret = -EBADFD; + break; + } + + kdbus_node_release(&domain->node); + kdbus_node_release(node); + return ret; +} + +static long kdbus_handle_ioctl_ep(struct file *file, unsigned int cmd, + void __user *buf) +{ + struct kdbus_handle *handle = file->private_data; + struct kdbus_node *node = file_inode(file)->i_private; + struct kdbus_ep *ep, *file_ep = kdbus_ep_from_node(node); + struct kdbus_bus *bus = file_ep->bus; + struct kdbus_conn *conn; + int ret = 0; + + if (!kdbus_node_acquire(node)) + return -ESHUTDOWN; + + switch (cmd) { + case KDBUS_CMD_ENDPOINT_MAKE: { + /* creating custom endpoints is a privileged operation */ + if (!kdbus_ep_is_owner(file_ep, file)) { + ret = -EPERM; + break; + } + + ep = kdbus_cmd_ep_make(bus, buf); + if (IS_ERR_OR_NULL(ep)) { + ret = PTR_ERR_OR_ZERO(ep); + break; + } + + handle->ep_owner = ep; + ret = KDBUS_HANDLE_EP_OWNER; + break; + } + + case KDBUS_CMD_HELLO: + conn = kdbus_cmd_hello(file_ep, file, buf); + if (IS_ERR_OR_NULL(conn)) { + ret = PTR_ERR_OR_ZERO(conn); + break; + } + + handle->conn = conn; + ret = KDBUS_HANDLE_CONNECTED; + break; + + default: + ret = -EBADFD; + break; + } + + kdbus_node_release(node); + return ret; +} + +static long kdbus_handle_ioctl_ep_owner(struct file *file, unsigned int command, + void __user *buf) +{ + struct kdbus_handle *handle = file->private_data; + struct kdbus_ep *ep = handle->ep_owner; + int ret; + + if (!kdbus_node_acquire(&ep->node)) + return -ESHUTDOWN; + + switch (command) { + case KDBUS_CMD_ENDPOINT_UPDATE: + ret = kdbus_cmd_ep_update(ep, buf); + break; + default: + ret = -EBADFD; + break; + } + + kdbus_node_release(&ep->node); + return ret; +} + +static long kdbus_handle_ioctl_connected(struct file *file, + unsigned int command, void __user *buf) +{ + struct kdbus_handle *handle = file->private_data; + struct kdbus_conn *conn = handle->conn; + struct kdbus_conn *release_conn = NULL; + int ret; + + release_conn = conn; + ret = kdbus_conn_acquire(release_conn); + if (ret < 0) + return ret; + + switch (command) { + case KDBUS_CMD_BYEBYE: + /* + * BYEBYE is special; we must not acquire a connection when + * calling into kdbus_conn_disconnect() or we will deadlock, + * because kdbus_conn_disconnect() will wait for all acquired + * references to be dropped. 
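 *
 * In other words, a (hypothetical) ordering like
 *
 *        kdbus_conn_acquire(conn);
 *        kdbus_cmd_byebye_unlocked(conn, buf);
 *        kdbus_conn_release(conn);
 *
 * would hang, because the disconnect triggered by BYEBYE waits for the
 * acquisition taken right above it. Hence the code below releases the
 * connection before dispatching KDBUS_CMD_BYEBYE.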
+ */ + kdbus_conn_release(release_conn); + release_conn = NULL; + ret = kdbus_cmd_byebye_unlocked(conn, buf); + break; + case KDBUS_CMD_NAME_ACQUIRE: + ret = kdbus_cmd_name_acquire(conn, buf); + break; + case KDBUS_CMD_NAME_RELEASE: + ret = kdbus_cmd_name_release(conn, buf); + break; + case KDBUS_CMD_LIST: + ret = kdbus_cmd_list(conn, buf); + break; + case KDBUS_CMD_CONN_INFO: + ret = kdbus_cmd_conn_info(conn, buf); + break; + case KDBUS_CMD_BUS_CREATOR_INFO: + ret = kdbus_cmd_bus_creator_info(conn, buf); + break; + case KDBUS_CMD_UPDATE: + ret = kdbus_cmd_update(conn, buf); + break; + case KDBUS_CMD_MATCH_ADD: + ret = kdbus_cmd_match_add(conn, buf); + break; + case KDBUS_CMD_MATCH_REMOVE: + ret = kdbus_cmd_match_remove(conn, buf); + break; + case KDBUS_CMD_SEND: + ret = kdbus_cmd_send(conn, file, buf); + break; + case KDBUS_CMD_RECV: + ret = kdbus_cmd_recv(conn, buf); + break; + case KDBUS_CMD_FREE: + ret = kdbus_cmd_free(conn, buf); + break; + default: + ret = -EBADFD; + break; + } + + kdbus_conn_release(release_conn); + return ret; +} + +static long kdbus_handle_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct kdbus_handle *handle = file->private_data; + struct kdbus_node *node = kdbus_node_from_inode(file_inode(file)); + void __user *argp = (void __user *)arg; + long ret = -EBADFD; + + switch (cmd) { + case KDBUS_CMD_BUS_MAKE: + case KDBUS_CMD_ENDPOINT_MAKE: + case KDBUS_CMD_HELLO: + mutex_lock(&handle->lock); + if (handle->type == KDBUS_HANDLE_NONE) { + if (node->type == KDBUS_NODE_CONTROL) + ret = kdbus_handle_ioctl_control(file, cmd, + argp); + else if (node->type == KDBUS_NODE_ENDPOINT) + ret = kdbus_handle_ioctl_ep(file, cmd, argp); + + if (ret > 0) { + /* + * The data given via open() is not sufficient + * to setup a kdbus handle. Hence, we require + * the user to perform a setup ioctl. This setup + * can only be performed once and defines the + * type of the handle. The different setup + * ioctls are locked against each other so they + * cannot race. Once the handle type is set, + * the type-dependent ioctls are enabled. To + * improve performance, we don't lock those via + * handle->lock. Instead, we issue a + * write-barrier before performing the + * type-change, which pairs with smp_rmb() in + * all handlers that access the type field. This + * guarantees the handle is fully setup, if + * handle->type is set. If handle->type is + * unset, you must not make any assumptions + * without taking handle->lock. + * Note that handle->type is only set once. It + * will never change afterwards. + */ + smp_wmb(); + handle->type = ret; + } + } + mutex_unlock(&handle->lock); + break; + + case KDBUS_CMD_ENDPOINT_UPDATE: + case KDBUS_CMD_BYEBYE: + case KDBUS_CMD_NAME_ACQUIRE: + case KDBUS_CMD_NAME_RELEASE: + case KDBUS_CMD_LIST: + case KDBUS_CMD_CONN_INFO: + case KDBUS_CMD_BUS_CREATOR_INFO: + case KDBUS_CMD_UPDATE: + case KDBUS_CMD_MATCH_ADD: + case KDBUS_CMD_MATCH_REMOVE: + case KDBUS_CMD_SEND: + case KDBUS_CMD_RECV: + case KDBUS_CMD_FREE: { + enum kdbus_handle_type type; + + /* + * This read-barrier pairs with smp_wmb() of the handle setup. + * it guarantees the handle is fully written, in case the + * type has been set. It allows us to access the handle without + * taking handle->lock, given the guarantee that the type is + * only ever set once, and stays constant afterwards. + * Furthermore, the handle object itself is not modified in any + * way after the type is set. That is, the type-field is the + * last field that is written on any handle. 
If it has not been + * set, we must not access the handle here. + */ + type = handle->type; + smp_rmb(); + + if (type == KDBUS_HANDLE_EP_OWNER) + ret = kdbus_handle_ioctl_ep_owner(file, cmd, argp); + else if (type == KDBUS_HANDLE_CONNECTED) + ret = kdbus_handle_ioctl_connected(file, cmd, argp); + + break; + } + default: + ret = -ENOTTY; + break; + } + + return ret < 0 ? ret : 0; +} + +static unsigned int kdbus_handle_poll(struct file *file, + struct poll_table_struct *wait) +{ + struct kdbus_handle *handle = file->private_data; + enum kdbus_handle_type type; + unsigned int mask = POLLOUT | POLLWRNORM; + + /* + * This pairs with smp_wmb() during handle setup. It guarantees that + * _iff_ the handle type is set, handle->conn is valid. Furthermore, + * _iff_ the type is set, the handle object is constant and never + * changed again. If it's not set, we must not access the handle but + * bail out. We also must assume no setup has taken place, yet. + */ + type = handle->type; + smp_rmb(); + + /* Only a connected endpoint can read/write data */ + if (type != KDBUS_HANDLE_CONNECTED) + return POLLERR | POLLHUP; + + poll_wait(file, &handle->conn->wait, wait); + + /* + * Verify the connection hasn't been deactivated _after_ adding the + * wait-queue. This guarantees, that if the connection is deactivated + * after we checked it, the waitqueue is signaled and we're called + * again. + */ + if (!kdbus_conn_active(handle->conn)) + return POLLERR | POLLHUP; + + if (!list_empty(&handle->conn->queue.msg_list) || + atomic_read(&handle->conn->lost_count) > 0) + mask |= POLLIN | POLLRDNORM; + + return mask; +} + +static int kdbus_handle_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct kdbus_handle *handle = file->private_data; + enum kdbus_handle_type type; + int ret = -EBADFD; + + /* + * This pairs with smp_wmb() during handle setup. It guarantees that + * _iff_ the handle type is set, handle->conn is valid. Furthermore, + * _iff_ the type is set, the handle object is constant and never + * changed again. If it's not set, we must not access the handle but + * bail out. We also must assume no setup has taken place, yet. + */ + type = handle->type; + smp_rmb(); + + /* Only connected handles have a pool we can map */ + if (type == KDBUS_HANDLE_CONNECTED) + ret = kdbus_pool_mmap(handle->conn->pool, vma); + + return ret; +} + +const struct file_operations kdbus_handle_ops = { + .owner = THIS_MODULE, + .open = kdbus_handle_open, + .release = kdbus_handle_release, + .poll = kdbus_handle_poll, + .llseek = noop_llseek, + .unlocked_ioctl = kdbus_handle_ioctl, + .mmap = kdbus_handle_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = kdbus_handle_ioctl, +#endif +}; diff -Nur linux-4.2/ipc/kdbus/handle.h linux-4.2-pck/ipc/kdbus/handle.h --- linux-4.2/ipc/kdbus/handle.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/handle.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef __KDBUS_HANDLE_H +#define __KDBUS_HANDLE_H + +#include +#include + +extern const struct file_operations kdbus_handle_ops; + +/** + * kdbus_arg - information and state of a single ioctl command item + * @type: item type + * @item: set by the parser to the first found item of this type + * @multiple: whether multiple items of this type are allowed + * @mandatory: whether at least one item of this type is required + * + * This structure describes a single item in an ioctl command payload. The + * caller has to pre-fill the type and flags, the parser will then use this + * information to verify the ioctl payload. @item is set by the parser to point + * to the first occurrence of the item. + */ +struct kdbus_arg { + u64 type; + struct kdbus_item *item; + bool multiple : 1; + bool mandatory : 1; +}; + +/** + * kdbus_args - information and state of ioctl command parser + * @allowed_flags: set of flags this command supports + * @argc: number of items in @argv + * @argv: array of items this command supports + * @user: set by parser to user-space location of current command + * @cmd: set by parser to kernel copy of command payload + * @cmd_buf: inline buf to avoid kmalloc() on small cmds + * @items: points to item array in @cmd + * @items_size: size of @items in bytes + * @is_cmd: whether this is a command-payload or msg-payload + * + * This structure is used to parse ioctl command payloads on each invocation. + * The ioctl handler has to pre-fill the flags and allowed items before passing + * the object to kdbus_args_parse(). The parser will copy the command payload + * into kernel-space and verify the correctness of the data. + * + * We use a 256 bytes buffer for small command payloads, to be allocated on + * stack on syscall entrance. + */ +struct kdbus_args { + u64 allowed_flags; + size_t argc; + struct kdbus_arg *argv; + + struct kdbus_cmd __user *user; + struct kdbus_cmd *cmd; + u8 cmd_buf[256]; + + struct kdbus_item *items; + size_t items_size; + bool is_cmd : 1; +}; + +int __kdbus_args_parse(struct kdbus_args *args, bool is_cmd, void __user *argp, + size_t type_size, size_t items_offset, void **out); +int kdbus_args_clear(struct kdbus_args *args, int ret); + +#define kdbus_args_parse(_args, _argp, _v) \ + ({ \ + BUILD_BUG_ON(offsetof(typeof(**(_v)), size) != \ + offsetof(struct kdbus_cmd, size)); \ + BUILD_BUG_ON(offsetof(typeof(**(_v)), flags) != \ + offsetof(struct kdbus_cmd, flags)); \ + BUILD_BUG_ON(offsetof(typeof(**(_v)), return_flags) != \ + offsetof(struct kdbus_cmd, return_flags)); \ + __kdbus_args_parse((_args), 1, (_argp), sizeof(**(_v)), \ + offsetof(typeof(**(_v)), items), \ + (void **)(_v)); \ + }) + +#define kdbus_args_parse_msg(_args, _argp, _v) \ + ({ \ + BUILD_BUG_ON(offsetof(typeof(**(_v)), size) != \ + offsetof(struct kdbus_cmd, size)); \ + BUILD_BUG_ON(offsetof(typeof(**(_v)), flags) != \ + offsetof(struct kdbus_cmd, flags)); \ + __kdbus_args_parse((_args), 0, (_argp), sizeof(**(_v)), \ + offsetof(typeof(**(_v)), items), \ + (void **)(_v)); \ + }) + +#endif diff -Nur linux-4.2/ipc/kdbus/item.c linux-4.2-pck/ipc/kdbus/item.c --- linux-4.2/ipc/kdbus/item.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/item.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,293 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free 
software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include + +#include "item.h" +#include "limits.h" +#include "util.h" + +/* + * This verifies the string at position @str with size @size is properly + * zero-terminated and does not contain a 0-byte but at the end. + */ +static bool kdbus_str_valid(const char *str, size_t size) +{ + return size > 0 && memchr(str, '\0', size) == str + size - 1; +} + +/** + * kdbus_item_validate_name() - validate an item containing a name + * @item: Item to validate + * + * Return: zero on success or an negative error code on failure + */ +int kdbus_item_validate_name(const struct kdbus_item *item) +{ + const char *name = item->str; + unsigned int i; + size_t len; + + if (item->size < KDBUS_ITEM_HEADER_SIZE + 2) + return -EINVAL; + + if (item->size > KDBUS_ITEM_HEADER_SIZE + + KDBUS_SYSNAME_MAX_LEN + 1) + return -ENAMETOOLONG; + + if (!kdbus_str_valid(name, KDBUS_ITEM_PAYLOAD_SIZE(item))) + return -EINVAL; + + len = strlen(name); + if (len == 0) + return -EINVAL; + + for (i = 0; i < len; i++) { + if (isalpha(name[i])) + continue; + if (isdigit(name[i])) + continue; + if (name[i] == '_') + continue; + if (i > 0 && i + 1 < len && (name[i] == '-' || name[i] == '.')) + continue; + + return -EINVAL; + } + + return 0; +} + +/** + * kdbus_item_validate() - validate a single item + * @item: item to validate + * + * Return: 0 if item is valid, negative error code if not. + */ +int kdbus_item_validate(const struct kdbus_item *item) +{ + size_t payload_size = KDBUS_ITEM_PAYLOAD_SIZE(item); + size_t l; + int ret; + + BUILD_BUG_ON(KDBUS_ITEM_HEADER_SIZE != + sizeof(struct kdbus_item_header)); + + if (item->size < KDBUS_ITEM_HEADER_SIZE) + return -EINVAL; + + switch (item->type) { + case KDBUS_ITEM_NEGOTIATE: + if (payload_size % sizeof(u64) != 0) + return -EINVAL; + break; + + case KDBUS_ITEM_PAYLOAD_VEC: + case KDBUS_ITEM_PAYLOAD_OFF: + if (payload_size != sizeof(struct kdbus_vec)) + return -EINVAL; + if (item->vec.size == 0 || item->vec.size > SIZE_MAX) + return -EINVAL; + break; + + case KDBUS_ITEM_PAYLOAD_MEMFD: + if (payload_size != sizeof(struct kdbus_memfd)) + return -EINVAL; + if (item->memfd.size == 0 || item->memfd.size > SIZE_MAX) + return -EINVAL; + if (item->memfd.fd < 0) + return -EBADF; + break; + + case KDBUS_ITEM_FDS: + if (payload_size % sizeof(int) != 0) + return -EINVAL; + break; + + case KDBUS_ITEM_CANCEL_FD: + if (payload_size != sizeof(int)) + return -EINVAL; + break; + + case KDBUS_ITEM_BLOOM_PARAMETER: + if (payload_size != sizeof(struct kdbus_bloom_parameter)) + return -EINVAL; + break; + + case KDBUS_ITEM_BLOOM_FILTER: + /* followed by the bloom-mask, depends on the bloom-size */ + if (payload_size < sizeof(struct kdbus_bloom_filter)) + return -EINVAL; + break; + + case KDBUS_ITEM_BLOOM_MASK: + /* size depends on bloom-size of bus */ + break; + + case KDBUS_ITEM_CONN_DESCRIPTION: + case KDBUS_ITEM_MAKE_NAME: + ret = kdbus_item_validate_name(item); + if (ret < 0) + return ret; + break; + + case KDBUS_ITEM_ATTACH_FLAGS_SEND: + case KDBUS_ITEM_ATTACH_FLAGS_RECV: + case KDBUS_ITEM_ID: + case KDBUS_ITEM_DST_ID: + if (payload_size != sizeof(u64)) + return -EINVAL; + break; + + case KDBUS_ITEM_TIMESTAMP: + if (payload_size != sizeof(struct kdbus_timestamp)) + return -EINVAL; + break; + + case KDBUS_ITEM_CREDS: + if (payload_size != 
sizeof(struct kdbus_creds)) + return -EINVAL; + break; + + case KDBUS_ITEM_AUXGROUPS: + if (payload_size % sizeof(u32) != 0) + return -EINVAL; + break; + + case KDBUS_ITEM_NAME: + case KDBUS_ITEM_DST_NAME: + case KDBUS_ITEM_PID_COMM: + case KDBUS_ITEM_TID_COMM: + case KDBUS_ITEM_EXE: + case KDBUS_ITEM_CMDLINE: + case KDBUS_ITEM_CGROUP: + case KDBUS_ITEM_SECLABEL: + if (!kdbus_str_valid(item->str, payload_size)) + return -EINVAL; + break; + + case KDBUS_ITEM_CAPS: + if (payload_size < sizeof(u32)) + return -EINVAL; + if (payload_size < sizeof(u32) + + 4 * CAP_TO_INDEX(item->caps.last_cap) * sizeof(u32)) + return -EINVAL; + break; + + case KDBUS_ITEM_AUDIT: + if (payload_size != sizeof(struct kdbus_audit)) + return -EINVAL; + break; + + case KDBUS_ITEM_POLICY_ACCESS: + if (payload_size != sizeof(struct kdbus_policy_access)) + return -EINVAL; + break; + + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_REMOVE: + case KDBUS_ITEM_NAME_CHANGE: + if (payload_size < sizeof(struct kdbus_notify_name_change)) + return -EINVAL; + l = payload_size - offsetof(struct kdbus_notify_name_change, + name); + if (l > 0 && !kdbus_str_valid(item->name_change.name, l)) + return -EINVAL; + break; + + case KDBUS_ITEM_ID_ADD: + case KDBUS_ITEM_ID_REMOVE: + if (payload_size != sizeof(struct kdbus_notify_id_change)) + return -EINVAL; + break; + + case KDBUS_ITEM_REPLY_TIMEOUT: + case KDBUS_ITEM_REPLY_DEAD: + if (payload_size != 0) + return -EINVAL; + break; + + default: + break; + } + + return 0; +} + +/** + * kdbus_items_validate() - validate items passed by user-space + * @items: items to validate + * @items_size: number of items + * + * This verifies that the passed items pointer is consistent and valid. + * Furthermore, each item is checked for: + * - valid "size" value + * - payload is of expected type + * - payload is fully included in the item + * - string payloads are zero-terminated + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_items_validate(const struct kdbus_item *items, size_t items_size) +{ + const struct kdbus_item *item; + int ret; + + KDBUS_ITEMS_FOREACH(item, items, items_size) { + if (!KDBUS_ITEM_VALID(item, items, items_size)) + return -EINVAL; + + ret = kdbus_item_validate(item); + if (ret < 0) + return ret; + } + + if (!KDBUS_ITEMS_END(item, items, items_size)) + return -EINVAL; + + return 0; +} + +/** + * kdbus_item_set() - Set item content + * @item: The item to modify + * @type: The item type to set (KDBUS_ITEM_*) + * @data: Data to copy to item->data, may be %NULL + * @len: Number of bytes in @data + * + * This sets type, size and data fields of an item. If @data is NULL, the data + * memory is cleared. + * + * Note that you must align your @data memory to 8 bytes. Trailing padding (in + * case @len is not 8byte aligned) is cleared by this call. + * + * Returns: Pointer to the following item. 
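 *
 * Since the next item is returned, calls can be chained to fill a buffer
 * with consecutive items (illustrative; id and dst are u64 variables, so
 * the 8-byte alignment requirement on @data is met):
 *
 *        item = kdbus_item_set(item, KDBUS_ITEM_ID, &id, sizeof(id));
 *        item = kdbus_item_set(item, KDBUS_ITEM_DST_ID, &dst, sizeof(dst));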
+ */ +struct kdbus_item *kdbus_item_set(struct kdbus_item *item, u64 type, + const void *data, size_t len) +{ + item->type = type; + item->size = KDBUS_ITEM_HEADER_SIZE + len; + + if (data) { + memcpy(item->data, data, len); + memset(item->data + len, 0, KDBUS_ALIGN8(len) - len); + } else { + memset(item->data, 0, KDBUS_ALIGN8(len)); + } + + return KDBUS_ITEM_NEXT(item); +} diff -Nur linux-4.2/ipc/kdbus/item.h linux-4.2-pck/ipc/kdbus/item.h --- linux-4.2/ipc/kdbus/item.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/item.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_ITEM_H +#define __KDBUS_ITEM_H + +#include +#include + +#include "util.h" + +/* generic access and iterators over a stream of items */ +#define KDBUS_ITEM_NEXT(_i) (typeof(_i))((u8 *)(_i) + KDBUS_ALIGN8((_i)->size)) +#define KDBUS_ITEMS_SIZE(_h, _is) ((_h)->size - offsetof(typeof(*(_h)), _is)) +#define KDBUS_ITEM_HEADER_SIZE offsetof(struct kdbus_item, data) +#define KDBUS_ITEM_SIZE(_s) KDBUS_ALIGN8(KDBUS_ITEM_HEADER_SIZE + (_s)) +#define KDBUS_ITEM_PAYLOAD_SIZE(_i) ((_i)->size - KDBUS_ITEM_HEADER_SIZE) + +#define KDBUS_ITEMS_FOREACH(_i, _is, _s) \ + for ((_i) = (_is); \ + ((u8 *)(_i) < (u8 *)(_is) + (_s)) && \ + ((u8 *)(_i) >= (u8 *)(_is)); \ + (_i) = KDBUS_ITEM_NEXT(_i)) + +#define KDBUS_ITEM_VALID(_i, _is, _s) \ + ((_i)->size >= KDBUS_ITEM_HEADER_SIZE && \ + (u8 *)(_i) + (_i)->size > (u8 *)(_i) && \ + (u8 *)(_i) + (_i)->size <= (u8 *)(_is) + (_s) && \ + (u8 *)(_i) >= (u8 *)(_is)) + +#define KDBUS_ITEMS_END(_i, _is, _s) \ + ((u8 *)(_i) == ((u8 *)(_is) + KDBUS_ALIGN8(_s))) + +/** + * struct kdbus_item_header - Describes the fix part of an item + * @size: The total size of the item + * @type: The item type, one of KDBUS_ITEM_* + */ +struct kdbus_item_header { + u64 size; + u64 type; +}; + +int kdbus_item_validate_name(const struct kdbus_item *item); +int kdbus_item_validate(const struct kdbus_item *item); +int kdbus_items_validate(const struct kdbus_item *items, size_t items_size); +struct kdbus_item *kdbus_item_set(struct kdbus_item *item, u64 type, + const void *data, size_t len); + +#endif diff -Nur linux-4.2/ipc/kdbus/limits.h linux-4.2-pck/ipc/kdbus/limits.h --- linux-4.2/ipc/kdbus/limits.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/limits.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef __KDBUS_DEFAULTS_H +#define __KDBUS_DEFAULTS_H + +#include + +/* maximum size of message header and items */ +#define KDBUS_MSG_MAX_SIZE SZ_8K + +/* maximum number of memfd items per message */ +#define KDBUS_MSG_MAX_MEMFD_ITEMS 16 + +/* max size of ioctl command data */ +#define KDBUS_CMD_MAX_SIZE SZ_32K + +/* maximum number of inflight fds in a target queue per user */ +#define KDBUS_CONN_MAX_FDS_PER_USER 16 + +/* maximum message payload size */ +#define KDBUS_MSG_MAX_PAYLOAD_VEC_SIZE SZ_2M + +/* maximum size of bloom bit field in bytes */ +#define KDBUS_BUS_BLOOM_MAX_SIZE SZ_4K + +/* maximum length of well-known bus name */ +#define KDBUS_NAME_MAX_LEN 255 + +/* maximum length of bus, domain, ep name */ +#define KDBUS_SYSNAME_MAX_LEN 63 + +/* maximum number of matches per connection */ +#define KDBUS_MATCH_MAX 4096 + +/* maximum number of queued messages from the same individual user */ +#define KDBUS_CONN_MAX_MSGS 256 + +/* maximum number of well-known names per connection */ +#define KDBUS_CONN_MAX_NAMES 256 + +/* maximum number of queued requests waiting for a reply */ +#define KDBUS_CONN_MAX_REQUESTS_PENDING 128 + +/* maximum number of connections per user in one domain */ +#define KDBUS_USER_MAX_CONN 1024 + +/* maximum number of buses per user in one domain */ +#define KDBUS_USER_MAX_BUSES 16 + +#endif diff -Nur linux-4.2/ipc/kdbus/main.c linux-4.2-pck/ipc/kdbus/main.c --- linux-4.2/ipc/kdbus/main.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/main.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include + +#include "util.h" +#include "fs.h" +#include "handle.h" +#include "metadata.h" +#include "node.h" + +/* + * This is a simplified outline of the internal kdbus object relations, for + * those interested in the inner life of the driver implementation. 
+ * + * From a mount point's (domain's) perspective: + * + * struct kdbus_domain + * |» struct kdbus_user *user (many, owned) + * '» struct kdbus_node node (embedded) + * |» struct kdbus_node children (many, referenced) + * |» struct kdbus_node *parent (pinned) + * '» struct kdbus_bus (many, pinned) + * |» struct kdbus_node node (embedded) + * '» struct kdbus_ep (many, pinned) + * |» struct kdbus_node node (embedded) + * |» struct kdbus_bus *bus (pinned) + * |» struct kdbus_conn conn_list (many, pinned) + * | |» struct kdbus_ep *ep (pinned) + * | |» struct kdbus_name_entry *activator_of (owned) + * | |» struct kdbus_match_db *match_db (owned) + * | |» struct kdbus_meta *meta (owned) + * | |» struct kdbus_match_db *match_db (owned) + * | | '» struct kdbus_match_entry (many, owned) + * | | + * | |» struct kdbus_pool *pool (owned) + * | | '» struct kdbus_pool_slice *slices (many, owned) + * | | '» struct kdbus_pool *pool (pinned) + * | | + * | |» struct kdbus_user *user (pinned) + * | `» struct kdbus_queue_entry entries (many, embedded) + * | |» struct kdbus_pool_slice *slice (pinned) + * | |» struct kdbus_conn_reply *reply (owned) + * | '» struct kdbus_user *user (pinned) + * | + * '» struct kdbus_user *user (pinned) + * '» struct kdbus_policy_db policy_db (embedded) + * |» struct kdbus_policy_db_entry (many, owned) + * | |» struct kdbus_conn (pinned) + * | '» struct kdbus_ep (pinned) + * | + * '» struct kdbus_policy_db_cache_entry (many, owned) + * '» struct kdbus_conn (pinned) + * + * For the life-time of a file descriptor derived from calling open() on a file + * inside the mount point: + * + * struct kdbus_handle + * |» struct kdbus_meta *meta (owned) + * |» struct kdbus_ep *ep (pinned) + * |» struct kdbus_conn *conn (owned) + * '» struct kdbus_ep *ep (owned) + */ + +static int __init kdbus_init(void) +{ + int ret; + + ret = sysfs_create_mount_point(fs_kobj, KBUILD_MODNAME); + if (ret) + return ret; + + ret = kdbus_fs_init(); + if (ret < 0) { + pr_err("cannot register filesystem: %d\n", ret); + goto exit_dir; + } + + pr_info("initialized\n"); + return 0; + +exit_dir: + sysfs_remove_mount_point(fs_kobj, KBUILD_MODNAME); + return ret; +} + +static void __exit kdbus_exit(void) +{ + kdbus_fs_exit(); + sysfs_remove_mount_point(fs_kobj, KBUILD_MODNAME); + ida_destroy(&kdbus_node_ida); +} + +module_init(kdbus_init); +module_exit(kdbus_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("D-Bus, powerful, easy to use interprocess communication"); +MODULE_ALIAS_FS(KBUILD_MODNAME "fs"); diff -Nur linux-4.2/ipc/kdbus/match.c linux-4.2-pck/ipc/kdbus/match.c --- linux-4.2/ipc/kdbus/match.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/match.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,546 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "handle.h" +#include "item.h" +#include "match.h" +#include "message.h" +#include "names.h" + +/** + * struct kdbus_match_db - message filters + * @entries_list: List of matches + * @mdb_rwlock: Match data lock + * @entries_count: Number of entries in database + */ +struct kdbus_match_db { + struct list_head entries_list; + struct rw_semaphore mdb_rwlock; + unsigned int entries_count; +}; + +/** + * struct kdbus_match_entry - a match database entry + * @cookie: User-supplied cookie to lookup the entry + * @list_entry: The list entry element for the db list + * @rules_list: The list head for tracking rules of this entry + */ +struct kdbus_match_entry { + u64 cookie; + struct list_head list_entry; + struct list_head rules_list; +}; + +/** + * struct kdbus_bloom_mask - mask to match against filter + * @generations: Number of generations carried + * @data: Array of bloom bit fields + */ +struct kdbus_bloom_mask { + u64 generations; + u64 *data; +}; + +/** + * struct kdbus_match_rule - a rule appended to a match entry + * @type: An item type to match against + * @bloom_mask: Bloom mask to match a message's filter against, used + * with KDBUS_ITEM_BLOOM_MASK + * @name: Name to match against, used with KDBUS_ITEM_NAME, + * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE} + * @old_id: ID to match against, used with + * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE}, + * KDBUS_ITEM_ID_REMOVE + * @new_id: ID to match against, used with + * KDBUS_ITEM_NAME_{ADD,REMOVE,CHANGE}, + * KDBUS_ITEM_ID_REMOVE + * @src_id: ID to match against, used with KDBUS_ITEM_ID + * @dst_id: Message destination ID, used with KDBUS_ITEM_DST_ID + * @rules_entry: Entry in the entry's rules list + */ +struct kdbus_match_rule { + u64 type; + union { + struct kdbus_bloom_mask bloom_mask; + struct { + char *name; + u64 old_id; + u64 new_id; + }; + u64 src_id; + u64 dst_id; + }; + struct list_head rules_entry; +}; + +static void kdbus_match_rule_free(struct kdbus_match_rule *rule) +{ + if (!rule) + return; + + switch (rule->type) { + case KDBUS_ITEM_BLOOM_MASK: + kfree(rule->bloom_mask.data); + break; + + case KDBUS_ITEM_NAME: + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_REMOVE: + case KDBUS_ITEM_NAME_CHANGE: + kfree(rule->name); + break; + + case KDBUS_ITEM_ID: + case KDBUS_ITEM_DST_ID: + case KDBUS_ITEM_ID_ADD: + case KDBUS_ITEM_ID_REMOVE: + break; + + default: + BUG(); + } + + list_del(&rule->rules_entry); + kfree(rule); +} + +static void kdbus_match_entry_free(struct kdbus_match_entry *entry) +{ + struct kdbus_match_rule *r, *tmp; + + if (!entry) + return; + + list_for_each_entry_safe(r, tmp, &entry->rules_list, rules_entry) + kdbus_match_rule_free(r); + + list_del(&entry->list_entry); + kfree(entry); +} + +/** + * kdbus_match_db_free() - free match db resources + * @mdb: The match database + */ +void kdbus_match_db_free(struct kdbus_match_db *mdb) +{ + struct kdbus_match_entry *entry, *tmp; + + if (!mdb) + return; + + list_for_each_entry_safe(entry, tmp, &mdb->entries_list, list_entry) + kdbus_match_entry_free(entry); + + kfree(mdb); +} + +/** + * kdbus_match_db_new() - create a new match database + * + * Return: a new kdbus_match_db on success, ERR_PTR on failure. 
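+ *
+ * Example (editorial sketch; @conn is a hypothetical connection that owns the
+ * database, mirroring how conn->match_db is used elsewhere in this patch):
+ *
+ *	conn->match_db = kdbus_match_db_new();
+ *	if (IS_ERR(conn->match_db))
+ *		return PTR_ERR(conn->match_db);
+ *	...
+ *	kdbus_match_db_free(conn->match_db);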
+ */
+struct kdbus_match_db *kdbus_match_db_new(void)
+{
+	struct kdbus_match_db *d;
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return ERR_PTR(-ENOMEM);
+
+	init_rwsem(&d->mdb_rwlock);
+	INIT_LIST_HEAD(&d->entries_list);
+
+	return d;
+}
+
+static bool kdbus_match_bloom(const struct kdbus_bloom_filter *filter,
+			      const struct kdbus_bloom_mask *mask,
+			      const struct kdbus_conn *conn)
+{
+	size_t n = conn->ep->bus->bloom.size / sizeof(u64);
+	const u64 *m;
+	size_t i;
+
+	/*
+	 * The message's filter carries a generation identifier, the
+	 * match's mask possibly carries an array of multiple generations
+	 * of the mask. Select the mask with the closest match of the
+	 * filter's generation.
+	 */
+	m = mask->data + (min(filter->generation, mask->generations - 1) * n);
+
+	/*
+	 * The message's filter contains the message's properties, the
+	 * match's mask contains the properties to look for in the
+	 * message. Check the mask bit field against the filter bit field
+	 * to see whether the message possibly carries the properties the
+	 * connection has subscribed to.
+	 */
+	for (i = 0; i < n; i++)
+		if ((filter->data[i] & m[i]) != m[i])
+			return false;
+
+	return true;
+}
+
+static bool kdbus_match_rule_conn(const struct kdbus_match_rule *r,
+				  struct kdbus_conn *c,
+				  const struct kdbus_staging *s)
+{
+	lockdep_assert_held(&c->ep->bus->name_registry->rwlock);
+
+	switch (r->type) {
+	case KDBUS_ITEM_BLOOM_MASK:
+		return kdbus_match_bloom(s->bloom_filter, &r->bloom_mask, c);
+	case KDBUS_ITEM_ID:
+		return r->src_id == c->id || r->src_id == KDBUS_MATCH_ID_ANY;
+	case KDBUS_ITEM_DST_ID:
+		return r->dst_id == s->msg->dst_id ||
+		       r->dst_id == KDBUS_MATCH_ID_ANY;
+	case KDBUS_ITEM_NAME:
+		return kdbus_conn_has_name(c, r->name);
+	default:
+		return false;
+	}
+}
+
+static bool kdbus_match_rule_kernel(const struct kdbus_match_rule *r,
+				    const struct kdbus_staging *s)
+{
+	struct kdbus_item *n = s->notify;
+
+	if (WARN_ON(!n) || n->type != r->type)
+		return false;
+
+	switch (r->type) {
+	case KDBUS_ITEM_ID_ADD:
+		return r->new_id == KDBUS_MATCH_ID_ANY ||
+		       r->new_id == n->id_change.id;
+	case KDBUS_ITEM_ID_REMOVE:
+		return r->old_id == KDBUS_MATCH_ID_ANY ||
+		       r->old_id == n->id_change.id;
+	case KDBUS_ITEM_NAME_ADD:
+	case KDBUS_ITEM_NAME_CHANGE:
+	case KDBUS_ITEM_NAME_REMOVE:
+		return (r->old_id == KDBUS_MATCH_ID_ANY ||
+			r->old_id == n->name_change.old_id.id) &&
+		       (r->new_id == KDBUS_MATCH_ID_ANY ||
+			r->new_id == n->name_change.new_id.id) &&
+		       (!r->name || !strcmp(r->name, n->name_change.name));
+	default:
+		return false;
+	}
+}
+
+static bool kdbus_match_rules(const struct kdbus_match_entry *entry,
+			      struct kdbus_conn *c,
+			      const struct kdbus_staging *s)
+{
+	struct kdbus_match_rule *r;
+
+	list_for_each_entry(r, &entry->rules_list, rules_entry)
+		if ((c && !kdbus_match_rule_conn(r, c, s)) ||
+		    (!c && !kdbus_match_rule_kernel(r, s)))
+			return false;
+
+	return true;
+}
+
+/**
+ * kdbus_match_db_match_msg() - match a msg object against the database entries
+ * @mdb:		The match database
+ * @conn_src:		The connection object originating the message
+ * @staging:		Staging object containing the message to match against
+ *
+ * This function will walk through all the database entries previously uploaded
+ * with kdbus_match_db_add(). As soon as any of them has an all-satisfied rule
+ * set, this function will return true.
+ *
+ * The caller must hold the registry lock of conn_src->ep->bus if conn_src
+ * is non-NULL.
+ *
+ * Return: true if there was a matching database entry, false otherwise.
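+ *
+ * Example (editorial sketch; deliver_to() is a hypothetical helper and the
+ * name-registry lock is assumed to be held as described above):
+ *
+ *	if (kdbus_match_db_match_msg(conn_dst->match_db, conn_src, staging))
+ *		deliver_to(conn_dst, staging);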
+ */
+bool kdbus_match_db_match_msg(struct kdbus_match_db *mdb,
+			      struct kdbus_conn *conn_src,
+			      const struct kdbus_staging *staging)
+{
+	struct kdbus_match_entry *entry;
+	bool matched = false;
+
+	down_read(&mdb->mdb_rwlock);
+	list_for_each_entry(entry, &mdb->entries_list, list_entry) {
+		matched = kdbus_match_rules(entry, conn_src, staging);
+		if (matched)
+			break;
+	}
+	up_read(&mdb->mdb_rwlock);
+
+	return matched;
+}
+
+static int kdbus_match_db_remove_unlocked(struct kdbus_match_db *mdb,
+					  u64 cookie)
+{
+	struct kdbus_match_entry *entry, *tmp;
+	bool found = false;
+
+	list_for_each_entry_safe(entry, tmp, &mdb->entries_list, list_entry)
+		if (entry->cookie == cookie) {
+			kdbus_match_entry_free(entry);
+			--mdb->entries_count;
+			found = true;
+		}
+
+	return found ? 0 : -EBADSLT;
+}
+
+/**
+ * kdbus_cmd_match_add() - handle KDBUS_CMD_MATCH_ADD
+ * @conn:		connection to operate on
+ * @argp:		command payload
+ *
+ * One call to this function (or one ioctl(KDBUS_CMD_MATCH_ADD), respectively)
+ * adds one new database entry with n rules attached to it. Each rule is
+ * described with a kdbus_item, and an entry is considered matching if all
+ * its rules are satisfied.
+ *
+ * The items attached to a kdbus_cmd_match struct have the following mapping:
+ *
+ * KDBUS_ITEM_BLOOM_MASK:	A bloom mask
+ * KDBUS_ITEM_NAME:		A connection's source name
+ * KDBUS_ITEM_ID:		A connection ID
+ * KDBUS_ITEM_DST_ID:		A connection ID
+ * KDBUS_ITEM_NAME_ADD:
+ * KDBUS_ITEM_NAME_REMOVE:
+ * KDBUS_ITEM_NAME_CHANGE:	Well-known name changes, carry
+ *				kdbus_notify_name_change
+ * KDBUS_ITEM_ID_ADD:
+ * KDBUS_ITEM_ID_REMOVE:	Connection ID changes, carry
+ *				kdbus_notify_id_change
+ *
+ * For kdbus_notify_{id,name}_change structs, only the ID and name fields
+ * are looked at when adding an entry. The flags are unused.
+ *
+ * Also note that KDBUS_ITEM_BLOOM_MASK, KDBUS_ITEM_NAME, KDBUS_ITEM_ID,
+ * and KDBUS_ITEM_DST_ID are used to match messages from userspace, while the
+ * others apply to kernel-generated notifications.
+ *
+ * Return: >=0 on success, negative error code on failure.
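+ *
+ * Example (editorial sketch of the user-space side; @fd, @buf and the size
+ * values are illustrative only and error handling is omitted):
+ *
+ *	struct kdbus_cmd_match *cmd = buf;
+ *	struct kdbus_item *item = cmd->items;
+ *
+ *	cmd->size = cmd_size;
+ *	cmd->cookie = my_cookie;
+ *	item->size = item_size;
+ *	item->type = KDBUS_ITEM_ID;
+ *	item->id = KDBUS_MATCH_ID_ANY;
+ *	ioctl(fd, KDBUS_CMD_MATCH_ADD, cmd);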
+ */ +int kdbus_cmd_match_add(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_match_db *mdb = conn->match_db; + struct kdbus_match_entry *entry = NULL; + struct kdbus_cmd_match *cmd; + struct kdbus_item *item; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_BLOOM_MASK, .multiple = true }, + { .type = KDBUS_ITEM_NAME, .multiple = true }, + { .type = KDBUS_ITEM_ID, .multiple = true }, + { .type = KDBUS_ITEM_DST_ID, .multiple = true }, + { .type = KDBUS_ITEM_NAME_ADD, .multiple = true }, + { .type = KDBUS_ITEM_NAME_REMOVE, .multiple = true }, + { .type = KDBUS_ITEM_NAME_CHANGE, .multiple = true }, + { .type = KDBUS_ITEM_ID_ADD, .multiple = true }, + { .type = KDBUS_ITEM_ID_REMOVE, .multiple = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_MATCH_REPLACE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + ret = -ENOMEM; + goto exit; + } + + entry->cookie = cmd->cookie; + INIT_LIST_HEAD(&entry->list_entry); + INIT_LIST_HEAD(&entry->rules_list); + + KDBUS_ITEMS_FOREACH(item, cmd->items, KDBUS_ITEMS_SIZE(cmd, items)) { + struct kdbus_match_rule *rule; + size_t size = item->size - offsetof(struct kdbus_item, data); + + rule = kzalloc(sizeof(*rule), GFP_KERNEL); + if (!rule) { + ret = -ENOMEM; + goto exit; + } + + rule->type = item->type; + INIT_LIST_HEAD(&rule->rules_entry); + + switch (item->type) { + case KDBUS_ITEM_BLOOM_MASK: { + u64 bsize = conn->ep->bus->bloom.size; + u64 generations; + u64 remainder; + + generations = div64_u64_rem(size, bsize, &remainder); + if (size < bsize || remainder > 0) { + ret = -EDOM; + break; + } + + rule->bloom_mask.data = kmemdup(item->data, + size, GFP_KERNEL); + if (!rule->bloom_mask.data) { + ret = -ENOMEM; + break; + } + + rule->bloom_mask.generations = generations; + break; + } + + case KDBUS_ITEM_NAME: + if (!kdbus_name_is_valid(item->str, false)) { + ret = -EINVAL; + break; + } + + rule->name = kstrdup(item->str, GFP_KERNEL); + if (!rule->name) + ret = -ENOMEM; + + break; + + case KDBUS_ITEM_ID: + rule->src_id = item->id; + break; + + case KDBUS_ITEM_DST_ID: + rule->dst_id = item->id; + break; + + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_REMOVE: + case KDBUS_ITEM_NAME_CHANGE: + rule->old_id = item->name_change.old_id.id; + rule->new_id = item->name_change.new_id.id; + + if (size > sizeof(struct kdbus_notify_name_change)) { + rule->name = kstrdup(item->name_change.name, + GFP_KERNEL); + if (!rule->name) + ret = -ENOMEM; + } + + break; + + case KDBUS_ITEM_ID_ADD: + case KDBUS_ITEM_ID_REMOVE: + if (item->type == KDBUS_ITEM_ID_ADD) + rule->new_id = item->id_change.id; + else + rule->old_id = item->id_change.id; + + break; + } + + if (ret < 0) { + kdbus_match_rule_free(rule); + goto exit; + } + + list_add_tail(&rule->rules_entry, &entry->rules_list); + } + + down_write(&mdb->mdb_rwlock); + + /* Remove any entry that has the same cookie as the current one. */ + if (cmd->flags & KDBUS_MATCH_REPLACE) + kdbus_match_db_remove_unlocked(mdb, entry->cookie); + + /* + * If the above removal caught any entry, there will be room for the + * new one. 
+ */ + if (++mdb->entries_count > KDBUS_MATCH_MAX) { + --mdb->entries_count; + ret = -EMFILE; + } else { + list_add_tail(&entry->list_entry, &mdb->entries_list); + entry = NULL; + } + + up_write(&mdb->mdb_rwlock); + +exit: + kdbus_match_entry_free(entry); + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_match_remove() - handle KDBUS_CMD_MATCH_REMOVE + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_match_remove(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_cmd_match *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + down_write(&conn->match_db->mdb_rwlock); + ret = kdbus_match_db_remove_unlocked(conn->match_db, cmd->cookie); + up_write(&conn->match_db->mdb_rwlock); + + return kdbus_args_clear(&args, ret); +} diff -Nur linux-4.2/ipc/kdbus/match.h linux-4.2-pck/ipc/kdbus/match.h --- linux-4.2/ipc/kdbus/match.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/match.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_MATCH_H +#define __KDBUS_MATCH_H + +struct kdbus_conn; +struct kdbus_match_db; +struct kdbus_staging; + +struct kdbus_match_db *kdbus_match_db_new(void); +void kdbus_match_db_free(struct kdbus_match_db *db); +int kdbus_match_db_add(struct kdbus_conn *conn, + struct kdbus_cmd_match *cmd); +int kdbus_match_db_remove(struct kdbus_conn *conn, + struct kdbus_cmd_match *cmd); +bool kdbus_match_db_match_msg(struct kdbus_match_db *db, + struct kdbus_conn *conn_src, + const struct kdbus_staging *staging); + +int kdbus_cmd_match_add(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_match_remove(struct kdbus_conn *conn, void __user *argp); + +#endif diff -Nur linux-4.2/ipc/kdbus/message.c linux-4.2-pck/ipc/kdbus/message.c --- linux-4.2/ipc/kdbus/message.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/message.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,1040 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "domain.h" +#include "endpoint.h" +#include "handle.h" +#include "item.h" +#include "match.h" +#include "message.h" +#include "names.h" +#include "policy.h" + +static const char * const zeros = "\0\0\0\0\0\0\0"; + +static struct kdbus_gaps *kdbus_gaps_new(size_t n_memfds, size_t n_fds) +{ + size_t size_offsets, size_memfds, size_fds, size; + struct kdbus_gaps *gaps; + + size_offsets = n_memfds * sizeof(*gaps->memfd_offsets); + size_memfds = n_memfds * sizeof(*gaps->memfd_files); + size_fds = n_fds * sizeof(*gaps->fd_files); + size = sizeof(*gaps) + size_offsets + size_memfds + size_fds; + + gaps = kzalloc(size, GFP_KERNEL); + if (!gaps) + return ERR_PTR(-ENOMEM); + + kref_init(&gaps->kref); + gaps->n_memfds = 0; /* we reserve n_memfds, but don't enforce them */ + gaps->memfd_offsets = (void *)(gaps + 1); + gaps->memfd_files = (void *)((u8 *)gaps->memfd_offsets + size_offsets); + gaps->n_fds = 0; /* we reserve n_fds, but don't enforce them */ + gaps->fd_files = (void *)((u8 *)gaps->memfd_files + size_memfds); + + return gaps; +} + +static void kdbus_gaps_free(struct kref *kref) +{ + struct kdbus_gaps *gaps = container_of(kref, struct kdbus_gaps, kref); + size_t i; + + for (i = 0; i < gaps->n_fds; ++i) + if (gaps->fd_files[i]) + fput(gaps->fd_files[i]); + for (i = 0; i < gaps->n_memfds; ++i) + if (gaps->memfd_files[i]) + fput(gaps->memfd_files[i]); + + kfree(gaps); +} + +/** + * kdbus_gaps_ref() - gain reference + * @gaps: gaps object + * + * Return: @gaps is returned + */ +struct kdbus_gaps *kdbus_gaps_ref(struct kdbus_gaps *gaps) +{ + if (gaps) + kref_get(&gaps->kref); + return gaps; +} + +/** + * kdbus_gaps_unref() - drop reference + * @gaps: gaps object + * + * Return: NULL + */ +struct kdbus_gaps *kdbus_gaps_unref(struct kdbus_gaps *gaps) +{ + if (gaps) + kref_put(&gaps->kref, kdbus_gaps_free); + return NULL; +} + +/** + * kdbus_gaps_install() - install file-descriptors + * @gaps: gaps object, or NULL + * @slice: pool slice that contains the message + * @out_incomplete output variable to note incomplete fds + * + * This function installs all file-descriptors of @gaps into the current + * process and copies the file-descriptor numbers into the target pool slice. + * + * If the file-descriptors were only partially installed, then @out_incomplete + * will be set to true. Otherwise, it's set to false. + * + * Return: 0 on success, negative error code on failure + */ +int kdbus_gaps_install(struct kdbus_gaps *gaps, struct kdbus_pool_slice *slice, + bool *out_incomplete) +{ + bool incomplete_fds = false; + struct kvec kvec; + size_t i, n_fds; + int ret, *fds; + + if (!gaps) { + /* nothing to do */ + *out_incomplete = incomplete_fds; + return 0; + } + + n_fds = gaps->n_fds + gaps->n_memfds; + if (n_fds < 1) { + /* nothing to do */ + *out_incomplete = incomplete_fds; + return 0; + } + + fds = kmalloc_array(n_fds, sizeof(*fds), GFP_TEMPORARY); + n_fds = 0; + if (!fds) + return -ENOMEM; + + /* 1) allocate fds and copy them over */ + + if (gaps->n_fds > 0) { + for (i = 0; i < gaps->n_fds; ++i) { + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + incomplete_fds = true; + + WARN_ON(!gaps->fd_files[i]); + + fds[n_fds++] = fd < 0 ? -1 : fd; + } + + /* + * The file-descriptor array can only be present once per + * message. Hence, prepare all fds and then copy them over with + * a single kvec. 
+ */ + + WARN_ON(!gaps->fd_offset); + + kvec.iov_base = fds; + kvec.iov_len = gaps->n_fds * sizeof(*fds); + ret = kdbus_pool_slice_copy_kvec(slice, gaps->fd_offset, + &kvec, 1, kvec.iov_len); + if (ret < 0) + goto exit; + } + + for (i = 0; i < gaps->n_memfds; ++i) { + int memfd; + + memfd = get_unused_fd_flags(O_CLOEXEC); + if (memfd < 0) { + incomplete_fds = true; + /* memfds are initialized to -1, skip copying it */ + continue; + } + + fds[n_fds++] = memfd; + + /* + * memfds have to be copied individually as they each are put + * into a separate item. This should not be an issue, though, + * as usually there is no need to send more than one memfd per + * message. + */ + + WARN_ON(!gaps->memfd_offsets[i]); + WARN_ON(!gaps->memfd_files[i]); + + kvec.iov_base = &memfd; + kvec.iov_len = sizeof(memfd); + ret = kdbus_pool_slice_copy_kvec(slice, gaps->memfd_offsets[i], + &kvec, 1, kvec.iov_len); + if (ret < 0) + goto exit; + } + + /* 2) install fds now that everything was successful */ + + for (i = 0; i < gaps->n_fds; ++i) + if (fds[i] >= 0) + fd_install(fds[i], get_file(gaps->fd_files[i])); + for (i = 0; i < gaps->n_memfds; ++i) + if (fds[gaps->n_fds + i] >= 0) + fd_install(fds[gaps->n_fds + i], + get_file(gaps->memfd_files[i])); + + ret = 0; + +exit: + if (ret < 0) + for (i = 0; i < n_fds; ++i) + put_unused_fd(fds[i]); + kfree(fds); + *out_incomplete = incomplete_fds; + return ret; +} + +static struct file *kdbus_get_fd(int fd) +{ + struct file *f, *ret; + struct inode *inode; + struct socket *sock; + + if (fd < 0) + return ERR_PTR(-EBADF); + + f = fget_raw(fd); + if (!f) + return ERR_PTR(-EBADF); + + inode = file_inode(f); + sock = S_ISSOCK(inode->i_mode) ? SOCKET_I(inode) : NULL; + + if (f->f_mode & FMODE_PATH) + ret = f; /* O_PATH is always allowed */ + else if (f->f_op == &kdbus_handle_ops) + ret = ERR_PTR(-EOPNOTSUPP); /* disallow kdbus-fd over kdbus */ + else if (sock && sock->sk && sock->ops && sock->ops->family == PF_UNIX) + ret = ERR_PTR(-EOPNOTSUPP); /* disallow UDS over kdbus */ + else + ret = f; /* all other are allowed */ + + if (f != ret) + fput(f); + + return ret; +} + +static struct file *kdbus_get_memfd(const struct kdbus_memfd *memfd) +{ + const int m = F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL; + struct file *f, *ret; + int s; + + if (memfd->fd < 0) + return ERR_PTR(-EBADF); + + f = fget(memfd->fd); + if (!f) + return ERR_PTR(-EBADF); + + s = shmem_get_seals(f); + if (s < 0) + ret = ERR_PTR(-EMEDIUMTYPE); + else if ((s & m) != m) + ret = ERR_PTR(-ETXTBSY); + else if (memfd->start + memfd->size > (u64)i_size_read(file_inode(f))) + ret = ERR_PTR(-EFAULT); + else + ret = f; + + if (f != ret) + fput(f); + + return ret; +} + +static int kdbus_msg_examine(struct kdbus_msg *msg, struct kdbus_bus *bus, + struct kdbus_cmd_send *cmd, size_t *out_n_memfds, + size_t *out_n_fds, size_t *out_n_parts) +{ + struct kdbus_item *item, *fds = NULL, *bloom = NULL, *dstname = NULL; + u64 n_parts, n_memfds, n_fds, vec_size; + + /* + * Step 1: + * Validate the message and command parameters. 
+ */ + + /* KDBUS_PAYLOAD_KERNEL is reserved to kernel messages */ + if (msg->payload_type == KDBUS_PAYLOAD_KERNEL) + return -EINVAL; + + if (msg->dst_id == KDBUS_DST_ID_BROADCAST) { + /* broadcasts must be marked as signals */ + if (!(msg->flags & KDBUS_MSG_SIGNAL)) + return -EBADMSG; + /* broadcasts cannot have timeouts */ + if (msg->timeout_ns > 0) + return -ENOTUNIQ; + } + + if (msg->flags & KDBUS_MSG_EXPECT_REPLY) { + /* if you expect a reply, you must specify a timeout */ + if (msg->timeout_ns == 0) + return -EINVAL; + /* signals cannot have replies */ + if (msg->flags & KDBUS_MSG_SIGNAL) + return -ENOTUNIQ; + } else { + /* must expect reply if sent as synchronous call */ + if (cmd->flags & KDBUS_SEND_SYNC_REPLY) + return -EINVAL; + /* cannot mark replies as signal */ + if (msg->cookie_reply && (msg->flags & KDBUS_MSG_SIGNAL)) + return -EINVAL; + } + + /* + * Step 2: + * Validate all passed items. While at it, select some statistics that + * are required to allocate state objects later on. + * + * Generic item validation has already been done via + * kdbus_item_validate(). Furthermore, the number of items is naturally + * limited by the maximum message size. Hence, only non-generic item + * checks are performed here (mainly integer overflow tests). + */ + + n_parts = 0; + n_memfds = 0; + n_fds = 0; + vec_size = 0; + + KDBUS_ITEMS_FOREACH(item, msg->items, KDBUS_ITEMS_SIZE(msg, items)) { + switch (item->type) { + case KDBUS_ITEM_PAYLOAD_VEC: { + void __force __user *ptr = KDBUS_PTR(item->vec.address); + u64 size = item->vec.size; + + if (vec_size + size < vec_size) + return -EMSGSIZE; + if (vec_size + size > KDBUS_MSG_MAX_PAYLOAD_VEC_SIZE) + return -EMSGSIZE; + if (ptr && unlikely(!access_ok(VERIFY_READ, ptr, size))) + return -EFAULT; + + if (ptr || size % 8) /* data or padding */ + ++n_parts; + break; + } + case KDBUS_ITEM_PAYLOAD_MEMFD: { + u64 start = item->memfd.start; + u64 size = item->memfd.size; + + if (start + size < start) + return -EMSGSIZE; + if (n_memfds >= KDBUS_MSG_MAX_MEMFD_ITEMS) + return -E2BIG; + + ++n_memfds; + if (size % 8) /* vec-padding required */ + ++n_parts; + break; + } + case KDBUS_ITEM_FDS: { + if (fds) + return -EEXIST; + + fds = item; + n_fds = KDBUS_ITEM_PAYLOAD_SIZE(item) / sizeof(int); + if (n_fds > KDBUS_CONN_MAX_FDS_PER_USER) + return -EMFILE; + + break; + } + case KDBUS_ITEM_BLOOM_FILTER: { + u64 bloom_size; + + if (bloom) + return -EEXIST; + + bloom = item; + bloom_size = KDBUS_ITEM_PAYLOAD_SIZE(item) - + offsetof(struct kdbus_bloom_filter, data); + if (!KDBUS_IS_ALIGNED8(bloom_size)) + return -EFAULT; + if (bloom_size != bus->bloom.size) + return -EDOM; + + break; + } + case KDBUS_ITEM_DST_NAME: { + if (dstname) + return -EEXIST; + + dstname = item; + if (!kdbus_name_is_valid(item->str, false)) + return -EINVAL; + if (msg->dst_id == KDBUS_DST_ID_BROADCAST) + return -EBADMSG; + + break; + } + default: + return -EINVAL; + } + } + + /* + * Step 3: + * Validate that required items were actually passed, and that no item + * contradicts the message flags. 
+ */ + + /* bloom filters must be attached _iff_ it's a signal */ + if (!(msg->flags & KDBUS_MSG_SIGNAL) != !bloom) + return -EBADMSG; + /* destination name is required if no ID is given */ + if (msg->dst_id == KDBUS_DST_ID_NAME && !dstname) + return -EDESTADDRREQ; + /* cannot send file-descriptors attached to broadcasts */ + if (msg->dst_id == KDBUS_DST_ID_BROADCAST && fds) + return -ENOTUNIQ; + + *out_n_memfds = n_memfds; + *out_n_fds = n_fds; + *out_n_parts = n_parts; + + return 0; +} + +static bool kdbus_staging_merge_vecs(struct kdbus_staging *staging, + struct kdbus_item **prev_item, + struct iovec **prev_vec, + const struct kdbus_item *merge) +{ + void __user *ptr = (void __user *)KDBUS_PTR(merge->vec.address); + u64 padding = merge->vec.size % 8; + struct kdbus_item *prev = *prev_item; + struct iovec *vec = *prev_vec; + + /* XXX: merging is disabled so far */ + if (0 && prev && prev->type == KDBUS_ITEM_PAYLOAD_OFF && + !merge->vec.address == !prev->vec.address) { + /* + * If we merge two VECs, we can always drop the second + * PAYLOAD_VEC item. Hence, include its size in the previous + * one. + */ + prev->vec.size += merge->vec.size; + + if (ptr) { + /* + * If we merge two data VECs, we need two iovecs to copy + * the data. But the items can be easily merged by + * summing their lengths. + */ + vec = &staging->parts[staging->n_parts++]; + vec->iov_len = merge->vec.size; + vec->iov_base = ptr; + staging->n_payload += vec->iov_len; + } else if (padding) { + /* + * If we merge two 0-vecs with the second 0-vec + * requiring padding, we need to insert an iovec to copy + * the 0-padding. We try merging it with the previous + * 0-padding iovec. This might end up with an + * iov_len==0, in which case we simply drop the iovec. + */ + if (vec) { + staging->n_payload -= vec->iov_len; + vec->iov_len = prev->vec.size % 8; + if (!vec->iov_len) { + --staging->n_parts; + vec = NULL; + } else { + staging->n_payload += vec->iov_len; + } + } else { + vec = &staging->parts[staging->n_parts++]; + vec->iov_len = padding; + vec->iov_base = (char __user *)zeros; + staging->n_payload += vec->iov_len; + } + } else { + /* + * If we merge two 0-vecs with the second 0-vec having + * no padding, we know the padding of the first stays + * the same. Hence, @vec needs no adjustment. + */ + } + + /* successfully merged with previous item */ + merge = prev; + } else { + /* + * If we cannot merge the payload item with the previous one, + * we simply insert a new iovec for the data/padding. + */ + if (ptr) { + vec = &staging->parts[staging->n_parts++]; + vec->iov_len = merge->vec.size; + vec->iov_base = ptr; + staging->n_payload += vec->iov_len; + } else if (padding) { + vec = &staging->parts[staging->n_parts++]; + vec->iov_len = padding; + vec->iov_base = (char __user *)zeros; + staging->n_payload += vec->iov_len; + } else { + vec = NULL; + } + } + + *prev_item = (struct kdbus_item *)merge; + *prev_vec = vec; + + return merge == prev; +} + +static int kdbus_staging_import(struct kdbus_staging *staging) +{ + struct kdbus_item *it, *item, *last, *prev_payload; + struct kdbus_gaps *gaps = staging->gaps; + struct kdbus_msg *msg = staging->msg; + struct iovec *part, *prev_part; + bool drop_item; + + drop_item = false; + last = NULL; + prev_payload = NULL; + prev_part = NULL; + + /* + * We modify msg->items along the way; make sure to use @item as offset + * to the next item (instead of the iterator @it). 
+ */ + for (it = item = msg->items; + it >= msg->items && + (u8 *)it < (u8 *)msg + msg->size && + (u8 *)it + it->size <= (u8 *)msg + msg->size; ) { + /* + * If we dropped items along the way, move current item to + * front. We must not access @it afterwards, but use @item + * instead! + */ + if (it != item) + memmove(item, it, it->size); + it = (void *)((u8 *)it + KDBUS_ALIGN8(item->size)); + + switch (item->type) { + case KDBUS_ITEM_PAYLOAD_VEC: { + size_t offset = staging->n_payload; + + if (kdbus_staging_merge_vecs(staging, &prev_payload, + &prev_part, item)) { + drop_item = true; + } else if (item->vec.address) { + /* real offset is patched later on */ + item->type = KDBUS_ITEM_PAYLOAD_OFF; + item->vec.offset = offset; + } else { + item->type = KDBUS_ITEM_PAYLOAD_OFF; + item->vec.offset = ~0ULL; + } + + break; + } + case KDBUS_ITEM_PAYLOAD_MEMFD: { + struct file *f; + + f = kdbus_get_memfd(&item->memfd); + if (IS_ERR(f)) + return PTR_ERR(f); + + gaps->memfd_files[gaps->n_memfds] = f; + gaps->memfd_offsets[gaps->n_memfds] = + (u8 *)&item->memfd.fd - (u8 *)msg; + ++gaps->n_memfds; + + /* memfds cannot be merged */ + prev_payload = item; + prev_part = NULL; + + /* insert padding to make following VECs aligned */ + if (item->memfd.size % 8) { + part = &staging->parts[staging->n_parts++]; + part->iov_len = item->memfd.size % 8; + part->iov_base = (char __user *)zeros; + staging->n_payload += part->iov_len; + } + + break; + } + case KDBUS_ITEM_FDS: { + size_t i, n_fds; + + n_fds = KDBUS_ITEM_PAYLOAD_SIZE(item) / sizeof(int); + for (i = 0; i < n_fds; ++i) { + struct file *f; + + f = kdbus_get_fd(item->fds[i]); + if (IS_ERR(f)) + return PTR_ERR(f); + + gaps->fd_files[gaps->n_fds++] = f; + } + + gaps->fd_offset = (u8 *)item->fds - (u8 *)msg; + + break; + } + case KDBUS_ITEM_BLOOM_FILTER: + staging->bloom_filter = &item->bloom_filter; + break; + case KDBUS_ITEM_DST_NAME: + staging->dst_name = item->str; + break; + } + + /* drop item if we merged it with a previous one */ + if (drop_item) { + drop_item = false; + } else { + last = item; + item = KDBUS_ITEM_NEXT(item); + } + } + + /* adjust message size regarding dropped items */ + msg->size = offsetof(struct kdbus_msg, items); + if (last) + msg->size += ((u8 *)last - (u8 *)msg->items) + last->size; + + return 0; +} + +static void kdbus_staging_reserve(struct kdbus_staging *staging) +{ + struct iovec *part; + + part = &staging->parts[staging->n_parts++]; + part->iov_base = (void __user *)zeros; + part->iov_len = 0; +} + +static struct kdbus_staging *kdbus_staging_new(struct kdbus_bus *bus, + size_t n_parts, + size_t msg_extra_size) +{ + const size_t reserved_parts = 5; /* see below for explanation */ + struct kdbus_staging *staging; + int ret; + + n_parts += reserved_parts; + + staging = kzalloc(sizeof(*staging) + n_parts * sizeof(*staging->parts) + + msg_extra_size, GFP_TEMPORARY); + if (!staging) + return ERR_PTR(-ENOMEM); + + staging->msg_seqnum = atomic64_inc_return(&bus->last_message_id); + staging->n_parts = 0; /* we reserve n_parts, but don't enforce them */ + staging->parts = (void *)(staging + 1); + + if (msg_extra_size) /* if requested, allocate message, too */ + staging->msg = (void *)((u8 *)staging->parts + + n_parts * sizeof(*staging->parts)); + + staging->meta_proc = kdbus_meta_proc_new(); + if (IS_ERR(staging->meta_proc)) { + ret = PTR_ERR(staging->meta_proc); + staging->meta_proc = NULL; + goto error; + } + + staging->meta_conn = kdbus_meta_conn_new(); + if (IS_ERR(staging->meta_conn)) { + ret = PTR_ERR(staging->meta_conn); + 
staging->meta_conn = NULL; + goto error; + } + + /* + * Prepare iovecs to copy the message into the target pool. We use the + * following iovecs: + * * iovec to copy "kdbus_msg.size" + * * iovec to copy "struct kdbus_msg" (minus size) plus items + * * iovec for possible padding after the items + * * iovec for metadata items + * * iovec for possible padding after the items + * + * Make sure to update @reserved_parts if you add more parts here. + */ + + kdbus_staging_reserve(staging); /* msg.size */ + kdbus_staging_reserve(staging); /* msg (minus msg.size) plus items */ + kdbus_staging_reserve(staging); /* msg padding */ + kdbus_staging_reserve(staging); /* meta */ + kdbus_staging_reserve(staging); /* meta padding */ + + return staging; + +error: + kdbus_staging_free(staging); + return ERR_PTR(ret); +} + +struct kdbus_staging *kdbus_staging_new_kernel(struct kdbus_bus *bus, + u64 dst, u64 cookie_timeout, + size_t it_size, size_t it_type) +{ + struct kdbus_staging *staging; + size_t size; + + size = offsetof(struct kdbus_msg, items) + + KDBUS_ITEM_HEADER_SIZE + it_size; + + staging = kdbus_staging_new(bus, 0, KDBUS_ALIGN8(size)); + if (IS_ERR(staging)) + return ERR_CAST(staging); + + staging->msg->size = size; + staging->msg->flags = (dst == KDBUS_DST_ID_BROADCAST) ? + KDBUS_MSG_SIGNAL : 0; + staging->msg->dst_id = dst; + staging->msg->src_id = KDBUS_SRC_ID_KERNEL; + staging->msg->payload_type = KDBUS_PAYLOAD_KERNEL; + staging->msg->cookie_reply = cookie_timeout; + staging->notify = staging->msg->items; + staging->notify->size = KDBUS_ITEM_HEADER_SIZE + it_size; + staging->notify->type = it_type; + + return staging; +} + +struct kdbus_staging *kdbus_staging_new_user(struct kdbus_bus *bus, + struct kdbus_cmd_send *cmd, + struct kdbus_msg *msg) +{ + const size_t reserved_parts = 1; /* see below for explanation */ + size_t n_memfds, n_fds, n_parts; + struct kdbus_staging *staging; + int ret; + + /* + * Examine user-supplied message and figure out how many resources we + * need to allocate in our staging area. This requires us to iterate + * the message twice, but saves us from re-allocating our resources + * all the time. + */ + + ret = kdbus_msg_examine(msg, bus, cmd, &n_memfds, &n_fds, &n_parts); + if (ret < 0) + return ERR_PTR(ret); + + n_parts += reserved_parts; + + /* + * Allocate staging area with the number of required resources. Make + * sure that we have enough iovecs for all required parts pre-allocated + * so this will hopefully be the only memory allocation for this + * message transaction. + */ + + staging = kdbus_staging_new(bus, n_parts, 0); + if (IS_ERR(staging)) + return ERR_CAST(staging); + + staging->msg = msg; + + /* + * If the message contains memfds or fd items, we need to remember some + * state so we can fill in the requested information at RECV time. + * File-descriptors cannot be passed at SEND time. Hence, allocate a + * gaps-object to remember that state. That gaps object is linked to + * from the staging area, but will also be linked to from the message + * queue of each peer. Hence, each receiver owns a reference to it, and + * it will later be used to fill the 'gaps' in message that couldn't be + * filled at SEND time. + * Note that the 'gaps' object is read-only once the staging-allocator + * returns. There might be connections receiving a queued message while + * the sender still broadcasts the message to other receivers. 
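+	 *
+	 * Editorial sketch (not part of the original patch), assuming a
+	 * hypothetical per-receiver queue entry @entry that pins the object
+	 * and resolves the gaps at RECV time:
+	 *
+	 *	entry->gaps = kdbus_gaps_ref(staging->gaps);
+	 *	...
+	 *	kdbus_gaps_install(entry->gaps, slice, &incomplete);
+	 *	entry->gaps = kdbus_gaps_unref(entry->gaps);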
+ */ + + if (n_memfds > 0 || n_fds > 0) { + staging->gaps = kdbus_gaps_new(n_memfds, n_fds); + if (IS_ERR(staging->gaps)) { + ret = PTR_ERR(staging->gaps); + staging->gaps = NULL; + kdbus_staging_free(staging); + return ERR_PTR(ret); + } + } + + /* + * kdbus_staging_new() already reserves parts for message setup. For + * user-supplied messages, we add the following iovecs: + * ... variable number of iovecs for payload ... + * * final iovec for possible padding of payload + * + * Make sure to update @reserved_parts if you add more parts here. + */ + + ret = kdbus_staging_import(staging); /* payload */ + kdbus_staging_reserve(staging); /* payload padding */ + + if (ret < 0) + goto error; + + return staging; + +error: + kdbus_staging_free(staging); + return ERR_PTR(ret); +} + +struct kdbus_staging *kdbus_staging_free(struct kdbus_staging *staging) +{ + if (!staging) + return NULL; + + kdbus_meta_conn_unref(staging->meta_conn); + kdbus_meta_proc_unref(staging->meta_proc); + kdbus_gaps_unref(staging->gaps); + kfree(staging); + + return NULL; +} + +static int kdbus_staging_collect_metadata(struct kdbus_staging *staging, + struct kdbus_conn *src, + struct kdbus_conn *dst, + u64 *out_attach) +{ + u64 attach; + int ret; + + if (src) + attach = kdbus_meta_msg_mask(src, dst); + else + attach = KDBUS_ATTACH_TIMESTAMP; /* metadata for kernel msgs */ + + if (src && !src->meta_fake) { + ret = kdbus_meta_proc_collect(staging->meta_proc, attach); + if (ret < 0) + return ret; + } + + ret = kdbus_meta_conn_collect(staging->meta_conn, src, + staging->msg_seqnum, attach); + if (ret < 0) + return ret; + + *out_attach = attach; + return 0; +} + +/** + * kdbus_staging_emit() - emit linearized message in target pool + * @staging: staging object to create message from + * @src: sender of the message (or NULL) + * @dst: target connection to allocate message for + * + * This allocates a pool-slice for @dst and copies the message provided by + * @staging into it. The new slice is then returned to the caller for further + * processing. It's not linked into any queue, yet. + * + * Return: Newly allocated slice or ERR_PTR on failure. + */ +struct kdbus_pool_slice *kdbus_staging_emit(struct kdbus_staging *staging, + struct kdbus_conn *src, + struct kdbus_conn *dst) +{ + struct kdbus_item *item, *meta_items = NULL; + struct kdbus_pool_slice *slice = NULL; + size_t off, size, meta_size; + struct iovec *v; + u64 attach, msg_size; + int ret; + + /* + * Step 1: + * Collect metadata from @src depending on the attach-flags allowed for + * @dst. Translate it into the namespaces pinned by @dst. + */ + + ret = kdbus_staging_collect_metadata(staging, src, dst, &attach); + if (ret < 0) + goto error; + + ret = kdbus_meta_emit(staging->meta_proc, NULL, staging->meta_conn, + dst, attach, &meta_items, &meta_size); + if (ret < 0) + goto error; + + /* + * Step 2: + * Setup iovecs for the message. See kdbus_staging_new() for allocation + * of those iovecs. All reserved iovecs have been initialized with + * iov_len=0 + iov_base=zeros. Furthermore, the iovecs to copy the + * actual message payload have already been initialized and need not be + * touched. 
+ */ + + v = staging->parts; + msg_size = staging->msg->size; + + /* msg.size */ + v->iov_len = sizeof(msg_size); + v->iov_base = (void __user *)&msg_size; + ++v; + + /* msg (after msg.size) plus items */ + v->iov_len = staging->msg->size - sizeof(staging->msg->size); + v->iov_base = (void __user *)((u8 *)staging->msg + + sizeof(staging->msg->size)); + ++v; + + /* padding after msg */ + v->iov_len = KDBUS_ALIGN8(staging->msg->size) - staging->msg->size; + v->iov_base = (void __user *)zeros; + ++v; + + if (meta_size > 0) { + /* metadata items */ + v->iov_len = meta_size; + v->iov_base = (void __user *)meta_items; + ++v; + + /* padding after metadata */ + v->iov_len = KDBUS_ALIGN8(meta_size) - meta_size; + v->iov_base = (void __user *)zeros; + ++v; + + msg_size = KDBUS_ALIGN8(msg_size) + meta_size; + } else { + /* metadata items */ + v->iov_len = 0; + v->iov_base = (void __user *)zeros; + ++v; + + /* padding after metadata */ + v->iov_len = 0; + v->iov_base = (void __user *)zeros; + ++v; + } + + /* ... payload iovecs are already filled in ... */ + + /* compute overall size and fill in padding after payload */ + size = KDBUS_ALIGN8(msg_size); + + if (staging->n_payload > 0) { + size += staging->n_payload; + + v = &staging->parts[staging->n_parts - 1]; + v->iov_len = KDBUS_ALIGN8(size) - size; + v->iov_base = (void __user *)zeros; + + size = KDBUS_ALIGN8(size); + } + + /* + * Step 3: + * The PAYLOAD_OFF items in the message contain a relative 'offset' + * field that tells the receiver where to find the actual payload. This + * offset is relative to the start of the message, and as such depends + * on the size of the metadata items we inserted. This size is variable + * and changes for each peer we send the message to. Hence, we remember + * the last relative offset that was used to calculate the 'offset' + * fields. For each message, we re-calculate it and patch all items, in + * case it changed. + */ + + off = KDBUS_ALIGN8(msg_size); + + if (off != staging->i_payload) { + KDBUS_ITEMS_FOREACH(item, staging->msg->items, + KDBUS_ITEMS_SIZE(staging->msg, items)) { + if (item->type != KDBUS_ITEM_PAYLOAD_OFF) + continue; + + item->vec.offset -= staging->i_payload; + item->vec.offset += off; + } + + staging->i_payload = off; + } + + /* + * Step 4: + * Allocate pool slice and copy over all data. Make sure to properly + * account on user quota. + */ + + ret = kdbus_conn_quota_inc(dst, src ? src->user : NULL, size, + staging->gaps ? staging->gaps->n_fds : 0); + if (ret < 0) + goto error; + + slice = kdbus_pool_slice_alloc(dst->pool, size, true); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto error; + } + + WARN_ON(kdbus_pool_slice_size(slice) != size); + + ret = kdbus_pool_slice_copy_iovec(slice, 0, staging->parts, + staging->n_parts, size); + if (ret < 0) + goto error; + + /* all done, return slice to caller */ + goto exit; + +error: + if (slice) + kdbus_conn_quota_dec(dst, src ? src->user : NULL, size, + staging->gaps ? 
staging->gaps->n_fds : 0); + kdbus_pool_slice_release(slice); + slice = ERR_PTR(ret); +exit: + kfree(meta_items); + return slice; +} diff -Nur linux-4.2/ipc/kdbus/message.h linux-4.2-pck/ipc/kdbus/message.h --- linux-4.2/ipc/kdbus/message.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/message.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_MESSAGE_H +#define __KDBUS_MESSAGE_H + +#include +#include +#include + +struct kdbus_bus; +struct kdbus_conn; +struct kdbus_meta_conn; +struct kdbus_meta_proc; +struct kdbus_pool_slice; + +/** + * struct kdbus_gaps - gaps in message to be filled later + * @kref: Reference counter + * @n_memfd_offs: Number of memfds + * @memfd_offs: Offsets of kdbus_memfd items in target slice + * @n_fds: Number of fds + * @fds: Array of sent fds + * @fds_offset: Offset of fd-array in target slice + * + * The 'gaps' object is used to track data that is needed to fill gaps in a + * message at RECV time. Usually, we try to compile the whole message at SEND + * time. This has the advantage, that we don't have to cache any information and + * can keep the memory consumption small. Furthermore, all copy operations can + * be combined into a single function call, which speeds up transactions + * considerably. + * However, things like file-descriptors can only be fully installed at RECV + * time. The gaps object tracks this data and pins it until a message is + * received. The gaps object is shared between all receivers of the same + * message. + */ +struct kdbus_gaps { + struct kref kref; + + /* state tracking for KDBUS_ITEM_PAYLOAD_MEMFD entries */ + size_t n_memfds; + u64 *memfd_offsets; + struct file **memfd_files; + + /* state tracking for KDBUS_ITEM_FDS */ + size_t n_fds; + struct file **fd_files; + u64 fd_offset; +}; + +struct kdbus_gaps *kdbus_gaps_ref(struct kdbus_gaps *gaps); +struct kdbus_gaps *kdbus_gaps_unref(struct kdbus_gaps *gaps); +int kdbus_gaps_install(struct kdbus_gaps *gaps, struct kdbus_pool_slice *slice, + bool *out_incomplete); + +/** + * struct kdbus_staging - staging area to import messages + * @msg: User-supplied message + * @gaps: Gaps-object created during import (or NULL if empty) + * @msg_seqnum: Message sequence number + * @notify_entry: Entry into list of kernel-generated notifications + * @i_payload: Current relative index of start of payload + * @n_payload: Total number of bytes needed for payload + * @n_parts: Number of parts + * @parts: Array of iovecs that make up the whole message + * @meta_proc: Process metadata of the sender (or NULL if empty) + * @meta_conn: Connection metadata of the sender (or NULL if empty) + * @bloom_filter: Pointer to the bloom-item in @msg, or NULL + * @dst_name: Pointer to the dst-name-item in @msg, or NULL + * @notify: Pointer to the notification item in @msg, or NULL + * + * The kdbus_staging object is a temporary staging area to import user-supplied + * messages into the kernel. It is only used during SEND and dropped once the + * message is queued. 
Any data that cannot be collected during SEND, is + * collected in a kdbus_gaps object and attached to the message queue. + */ +struct kdbus_staging { + struct kdbus_msg *msg; + struct kdbus_gaps *gaps; + u64 msg_seqnum; + struct list_head notify_entry; + + /* crafted iovecs to copy the message */ + size_t i_payload; + size_t n_payload; + size_t n_parts; + struct iovec *parts; + + /* metadata state */ + struct kdbus_meta_proc *meta_proc; + struct kdbus_meta_conn *meta_conn; + + /* cached pointers into @msg */ + const struct kdbus_bloom_filter *bloom_filter; + const char *dst_name; + struct kdbus_item *notify; +}; + +struct kdbus_staging *kdbus_staging_new_kernel(struct kdbus_bus *bus, + u64 dst, u64 cookie_timeout, + size_t it_size, size_t it_type); +struct kdbus_staging *kdbus_staging_new_user(struct kdbus_bus *bus, + struct kdbus_cmd_send *cmd, + struct kdbus_msg *msg); +struct kdbus_staging *kdbus_staging_free(struct kdbus_staging *staging); +struct kdbus_pool_slice *kdbus_staging_emit(struct kdbus_staging *staging, + struct kdbus_conn *src, + struct kdbus_conn *dst); + +#endif diff -Nur linux-4.2/ipc/kdbus/metadata.c linux-4.2-pck/ipc/kdbus/metadata.c --- linux-4.2/ipc/kdbus/metadata.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/metadata.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,1347 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "item.h" +#include "message.h" +#include "metadata.h" +#include "names.h" + +/** + * struct kdbus_meta_proc - Process metadata + * @kref: Reference counting + * @lock: Object lock + * @collected: Bitmask of collected items + * @valid: Bitmask of collected and valid items + * @cred: Credentials + * @pid: PID of process + * @tgid: TGID of process + * @ppid: PPID of process + * @tid_comm: TID comm line + * @pid_comm: PID comm line + * @exe_path: Executable path + * @root_path: Root-FS path + * @cmdline: Command-line + * @cgroup: Full cgroup path + * @seclabel: Seclabel + * @audit_loginuid: Audit login-UID + * @audit_sessionid: Audit session-ID + */ +struct kdbus_meta_proc { + struct kref kref; + struct mutex lock; + u64 collected; + u64 valid; + + /* KDBUS_ITEM_CREDS */ + /* KDBUS_ITEM_AUXGROUPS */ + /* KDBUS_ITEM_CAPS */ + const struct cred *cred; + + /* KDBUS_ITEM_PIDS */ + struct pid *pid; + struct pid *tgid; + struct pid *ppid; + + /* KDBUS_ITEM_TID_COMM */ + char tid_comm[TASK_COMM_LEN]; + /* KDBUS_ITEM_PID_COMM */ + char pid_comm[TASK_COMM_LEN]; + + /* KDBUS_ITEM_EXE */ + struct path exe_path; + struct path root_path; + + /* KDBUS_ITEM_CMDLINE */ + char *cmdline; + + /* KDBUS_ITEM_CGROUP */ + char *cgroup; + + /* KDBUS_ITEM_SECLABEL */ + char *seclabel; + + /* KDBUS_ITEM_AUDIT */ + kuid_t audit_loginuid; + unsigned int audit_sessionid; +}; + +/** + * struct kdbus_meta_conn + * @kref: Reference counting + * @lock: Object lock + * @collected: Bitmask of collected items + * @valid: Bitmask of collected and valid items + * @ts: Timestamp values + * @owned_names_items: Serialized items for owned names + * @owned_names_size: Size of @owned_names_items + * @conn_description: Connection description + */ +struct kdbus_meta_conn { + struct kref kref; + struct mutex lock; + u64 collected; + u64 valid; + + /* KDBUS_ITEM_TIMESTAMP */ + struct kdbus_timestamp ts; + + /* KDBUS_ITEM_OWNED_NAME */ + struct kdbus_item *owned_names_items; + size_t owned_names_size; + + /* KDBUS_ITEM_CONN_DESCRIPTION */ + char *conn_description; +}; + +/* fixed size equivalent of "kdbus_caps" */ +struct kdbus_meta_caps { + u32 last_cap; + struct { + u32 caps[_KERNEL_CAPABILITY_U32S]; + } set[4]; +}; + +/** + * kdbus_meta_proc_new() - Create process metadata object + * + * Return: Pointer to new object on success, ERR_PTR on failure. 
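+ *
+ * Example (editorial sketch; error handling shortened and @ret assumed to be
+ * declared by the caller):
+ *
+ *	struct kdbus_meta_proc *mp;
+ *
+ *	mp = kdbus_meta_proc_new();
+ *	if (IS_ERR(mp))
+ *		return PTR_ERR(mp);
+ *	ret = kdbus_meta_proc_collect(mp, KDBUS_ATTACH_CREDS |
+ *					  KDBUS_ATTACH_PIDS);
+ *	...
+ *	kdbus_meta_proc_unref(mp);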
+ */ +struct kdbus_meta_proc *kdbus_meta_proc_new(void) +{ + struct kdbus_meta_proc *mp; + + mp = kzalloc(sizeof(*mp), GFP_KERNEL); + if (!mp) + return ERR_PTR(-ENOMEM); + + kref_init(&mp->kref); + mutex_init(&mp->lock); + + return mp; +} + +static void kdbus_meta_proc_free(struct kref *kref) +{ + struct kdbus_meta_proc *mp = container_of(kref, struct kdbus_meta_proc, + kref); + + path_put(&mp->exe_path); + path_put(&mp->root_path); + if (mp->cred) + put_cred(mp->cred); + put_pid(mp->ppid); + put_pid(mp->tgid); + put_pid(mp->pid); + + kfree(mp->seclabel); + kfree(mp->cmdline); + kfree(mp->cgroup); + kfree(mp); +} + +/** + * kdbus_meta_proc_ref() - Gain reference + * @mp: Process metadata object + * + * Return: @mp is returned + */ +struct kdbus_meta_proc *kdbus_meta_proc_ref(struct kdbus_meta_proc *mp) +{ + if (mp) + kref_get(&mp->kref); + return mp; +} + +/** + * kdbus_meta_proc_unref() - Drop reference + * @mp: Process metadata object + * + * Return: NULL + */ +struct kdbus_meta_proc *kdbus_meta_proc_unref(struct kdbus_meta_proc *mp) +{ + if (mp) + kref_put(&mp->kref, kdbus_meta_proc_free); + return NULL; +} + +static void kdbus_meta_proc_collect_pids(struct kdbus_meta_proc *mp) +{ + struct task_struct *parent; + + mp->pid = get_pid(task_pid(current)); + mp->tgid = get_pid(task_tgid(current)); + + rcu_read_lock(); + parent = rcu_dereference(current->real_parent); + mp->ppid = get_pid(task_tgid(parent)); + rcu_read_unlock(); + + mp->valid |= KDBUS_ATTACH_PIDS; +} + +static void kdbus_meta_proc_collect_tid_comm(struct kdbus_meta_proc *mp) +{ + get_task_comm(mp->tid_comm, current); + mp->valid |= KDBUS_ATTACH_TID_COMM; +} + +static void kdbus_meta_proc_collect_pid_comm(struct kdbus_meta_proc *mp) +{ + get_task_comm(mp->pid_comm, current->group_leader); + mp->valid |= KDBUS_ATTACH_PID_COMM; +} + +static void kdbus_meta_proc_collect_exe(struct kdbus_meta_proc *mp) +{ + struct file *exe_file; + + rcu_read_lock(); + exe_file = rcu_dereference(current->mm->exe_file); + if (exe_file) { + mp->exe_path = exe_file->f_path; + path_get(&mp->exe_path); + get_fs_root(current->fs, &mp->root_path); + mp->valid |= KDBUS_ATTACH_EXE; + } + rcu_read_unlock(); +} + +static int kdbus_meta_proc_collect_cmdline(struct kdbus_meta_proc *mp) +{ + struct mm_struct *mm = current->mm; + char *cmdline; + + if (!mm->arg_end) + return 0; + + cmdline = strndup_user((const char __user *)mm->arg_start, + mm->arg_end - mm->arg_start); + if (IS_ERR(cmdline)) + return PTR_ERR(cmdline); + + mp->cmdline = cmdline; + mp->valid |= KDBUS_ATTACH_CMDLINE; + + return 0; +} + +static int kdbus_meta_proc_collect_cgroup(struct kdbus_meta_proc *mp) +{ +#ifdef CONFIG_CGROUPS + void *page; + char *s; + + page = (void *)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + + s = task_cgroup_path(current, page, PAGE_SIZE); + if (s) { + mp->cgroup = kstrdup(s, GFP_KERNEL); + if (!mp->cgroup) { + free_page((unsigned long)page); + return -ENOMEM; + } + } + + free_page((unsigned long)page); + mp->valid |= KDBUS_ATTACH_CGROUP; +#endif + + return 0; +} + +static int kdbus_meta_proc_collect_seclabel(struct kdbus_meta_proc *mp) +{ +#ifdef CONFIG_SECURITY + char *ctx = NULL; + u32 sid, len; + int ret; + + security_task_getsecid(current, &sid); + ret = security_secid_to_secctx(sid, &ctx, &len); + if (ret < 0) { + /* + * EOPNOTSUPP means no security module is active, + * lets skip adding the seclabel then. This effectively + * drops the SECLABEL item. + */ + return (ret == -EOPNOTSUPP) ? 
0 : ret; + } + + mp->seclabel = kstrdup(ctx, GFP_KERNEL); + security_release_secctx(ctx, len); + if (!mp->seclabel) + return -ENOMEM; + + mp->valid |= KDBUS_ATTACH_SECLABEL; +#endif + + return 0; +} + +static void kdbus_meta_proc_collect_audit(struct kdbus_meta_proc *mp) +{ +#ifdef CONFIG_AUDITSYSCALL + mp->audit_loginuid = audit_get_loginuid(current); + mp->audit_sessionid = audit_get_sessionid(current); + mp->valid |= KDBUS_ATTACH_AUDIT; +#endif +} + +/** + * kdbus_meta_proc_collect() - Collect process metadata + * @mp: Process metadata object + * @what: Attach flags to collect + * + * This collects process metadata from current and saves it in @mp. + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_meta_proc_collect(struct kdbus_meta_proc *mp, u64 what) +{ + int ret; + + if (!mp || !(what & (KDBUS_ATTACH_CREDS | + KDBUS_ATTACH_PIDS | + KDBUS_ATTACH_AUXGROUPS | + KDBUS_ATTACH_TID_COMM | + KDBUS_ATTACH_PID_COMM | + KDBUS_ATTACH_EXE | + KDBUS_ATTACH_CMDLINE | + KDBUS_ATTACH_CGROUP | + KDBUS_ATTACH_CAPS | + KDBUS_ATTACH_SECLABEL | + KDBUS_ATTACH_AUDIT))) + return 0; + + mutex_lock(&mp->lock); + + /* creds, auxgrps and caps share "struct cred" as context */ + { + const u64 m_cred = KDBUS_ATTACH_CREDS | + KDBUS_ATTACH_AUXGROUPS | + KDBUS_ATTACH_CAPS; + + if ((what & m_cred) && !(mp->collected & m_cred)) { + mp->cred = get_current_cred(); + mp->valid |= m_cred; + mp->collected |= m_cred; + } + } + + if ((what & KDBUS_ATTACH_PIDS) && + !(mp->collected & KDBUS_ATTACH_PIDS)) { + kdbus_meta_proc_collect_pids(mp); + mp->collected |= KDBUS_ATTACH_PIDS; + } + + if ((what & KDBUS_ATTACH_TID_COMM) && + !(mp->collected & KDBUS_ATTACH_TID_COMM)) { + kdbus_meta_proc_collect_tid_comm(mp); + mp->collected |= KDBUS_ATTACH_TID_COMM; + } + + if ((what & KDBUS_ATTACH_PID_COMM) && + !(mp->collected & KDBUS_ATTACH_PID_COMM)) { + kdbus_meta_proc_collect_pid_comm(mp); + mp->collected |= KDBUS_ATTACH_PID_COMM; + } + + if ((what & KDBUS_ATTACH_EXE) && + !(mp->collected & KDBUS_ATTACH_EXE)) { + kdbus_meta_proc_collect_exe(mp); + mp->collected |= KDBUS_ATTACH_EXE; + } + + if ((what & KDBUS_ATTACH_CMDLINE) && + !(mp->collected & KDBUS_ATTACH_CMDLINE)) { + ret = kdbus_meta_proc_collect_cmdline(mp); + if (ret < 0) + goto exit_unlock; + mp->collected |= KDBUS_ATTACH_CMDLINE; + } + + if ((what & KDBUS_ATTACH_CGROUP) && + !(mp->collected & KDBUS_ATTACH_CGROUP)) { + ret = kdbus_meta_proc_collect_cgroup(mp); + if (ret < 0) + goto exit_unlock; + mp->collected |= KDBUS_ATTACH_CGROUP; + } + + if ((what & KDBUS_ATTACH_SECLABEL) && + !(mp->collected & KDBUS_ATTACH_SECLABEL)) { + ret = kdbus_meta_proc_collect_seclabel(mp); + if (ret < 0) + goto exit_unlock; + mp->collected |= KDBUS_ATTACH_SECLABEL; + } + + if ((what & KDBUS_ATTACH_AUDIT) && + !(mp->collected & KDBUS_ATTACH_AUDIT)) { + kdbus_meta_proc_collect_audit(mp); + mp->collected |= KDBUS_ATTACH_AUDIT; + } + + ret = 0; + +exit_unlock: + mutex_unlock(&mp->lock); + return ret; +} + +/** + * kdbus_meta_fake_new() - Create fake metadata object + * + * Return: Pointer to new object on success, ERR_PTR on failure. 
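+ *
+ * A short sketch of how a fake metadata object is typically filled from
+ * user-supplied credentials (each of creds, pids and seclabel may be NULL):
+ *
+ *      struct kdbus_meta_fake *mf;
+ *      int ret;
+ *
+ *      mf = kdbus_meta_fake_new();
+ *      if (IS_ERR(mf))
+ *              return PTR_ERR(mf);
+ *
+ *      ret = kdbus_meta_fake_collect(mf, creds, pids, seclabel);
+ *      if (ret < 0)
+ *              mf = kdbus_meta_fake_free(mf);
+ *
+ * Note that kdbus_meta_fake_collect() may only be called once per object; a
+ * second call returns -EALREADY.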
+ */ +struct kdbus_meta_fake *kdbus_meta_fake_new(void) +{ + struct kdbus_meta_fake *mf; + + mf = kzalloc(sizeof(*mf), GFP_KERNEL); + if (!mf) + return ERR_PTR(-ENOMEM); + + return mf; +} + +/** + * kdbus_meta_fake_free() - Free fake metadata object + * @mf: Fake metadata object + * + * Return: NULL + */ +struct kdbus_meta_fake *kdbus_meta_fake_free(struct kdbus_meta_fake *mf) +{ + if (mf) { + put_pid(mf->ppid); + put_pid(mf->tgid); + put_pid(mf->pid); + kfree(mf->seclabel); + kfree(mf); + } + + return NULL; +} + +/** + * kdbus_meta_fake_collect() - Fill fake metadata from faked credentials + * @mf: Fake metadata object + * @creds: Creds to set, may be %NULL + * @pids: PIDs to set, may be %NULL + * @seclabel: Seclabel to set, may be %NULL + * + * This function takes information stored in @creds, @pids and @seclabel and + * resolves them to kernel-representations, if possible. This call uses the + * current task's namespaces to resolve the given information. + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_meta_fake_collect(struct kdbus_meta_fake *mf, + const struct kdbus_creds *creds, + const struct kdbus_pids *pids, + const char *seclabel) +{ + if (mf->valid) + return -EALREADY; + + if (creds) { + struct user_namespace *ns = current_user_ns(); + + mf->uid = make_kuid(ns, creds->uid); + mf->euid = make_kuid(ns, creds->euid); + mf->suid = make_kuid(ns, creds->suid); + mf->fsuid = make_kuid(ns, creds->fsuid); + + mf->gid = make_kgid(ns, creds->gid); + mf->egid = make_kgid(ns, creds->egid); + mf->sgid = make_kgid(ns, creds->sgid); + mf->fsgid = make_kgid(ns, creds->fsgid); + + if ((creds->uid != (uid_t)-1 && !uid_valid(mf->uid)) || + (creds->euid != (uid_t)-1 && !uid_valid(mf->euid)) || + (creds->suid != (uid_t)-1 && !uid_valid(mf->suid)) || + (creds->fsuid != (uid_t)-1 && !uid_valid(mf->fsuid)) || + (creds->gid != (gid_t)-1 && !gid_valid(mf->gid)) || + (creds->egid != (gid_t)-1 && !gid_valid(mf->egid)) || + (creds->sgid != (gid_t)-1 && !gid_valid(mf->sgid)) || + (creds->fsgid != (gid_t)-1 && !gid_valid(mf->fsgid))) + return -EINVAL; + + mf->valid |= KDBUS_ATTACH_CREDS; + } + + if (pids) { + mf->pid = get_pid(find_vpid(pids->tid)); + mf->tgid = get_pid(find_vpid(pids->pid)); + mf->ppid = get_pid(find_vpid(pids->ppid)); + + if ((pids->tid != 0 && !mf->pid) || + (pids->pid != 0 && !mf->tgid) || + (pids->ppid != 0 && !mf->ppid)) { + put_pid(mf->pid); + put_pid(mf->tgid); + put_pid(mf->ppid); + mf->pid = NULL; + mf->tgid = NULL; + mf->ppid = NULL; + return -EINVAL; + } + + mf->valid |= KDBUS_ATTACH_PIDS; + } + + if (seclabel) { + mf->seclabel = kstrdup(seclabel, GFP_KERNEL); + if (!mf->seclabel) + return -ENOMEM; + + mf->valid |= KDBUS_ATTACH_SECLABEL; + } + + return 0; +} + +/** + * kdbus_meta_conn_new() - Create connection metadata object + * + * Return: Pointer to new object on success, ERR_PTR on failure. 
+ */ +struct kdbus_meta_conn *kdbus_meta_conn_new(void) +{ + struct kdbus_meta_conn *mc; + + mc = kzalloc(sizeof(*mc), GFP_KERNEL); + if (!mc) + return ERR_PTR(-ENOMEM); + + kref_init(&mc->kref); + mutex_init(&mc->lock); + + return mc; +} + +static void kdbus_meta_conn_free(struct kref *kref) +{ + struct kdbus_meta_conn *mc = + container_of(kref, struct kdbus_meta_conn, kref); + + kfree(mc->conn_description); + kfree(mc->owned_names_items); + kfree(mc); +} + +/** + * kdbus_meta_conn_ref() - Gain reference + * @mc: Connection metadata object + */ +struct kdbus_meta_conn *kdbus_meta_conn_ref(struct kdbus_meta_conn *mc) +{ + if (mc) + kref_get(&mc->kref); + return mc; +} + +/** + * kdbus_meta_conn_unref() - Drop reference + * @mc: Connection metadata object + */ +struct kdbus_meta_conn *kdbus_meta_conn_unref(struct kdbus_meta_conn *mc) +{ + if (mc) + kref_put(&mc->kref, kdbus_meta_conn_free); + return NULL; +} + +static void kdbus_meta_conn_collect_timestamp(struct kdbus_meta_conn *mc, + u64 msg_seqnum) +{ + mc->ts.monotonic_ns = ktime_get_ns(); + mc->ts.realtime_ns = ktime_get_real_ns(); + + if (msg_seqnum) + mc->ts.seqnum = msg_seqnum; + + mc->valid |= KDBUS_ATTACH_TIMESTAMP; +} + +static int kdbus_meta_conn_collect_names(struct kdbus_meta_conn *mc, + struct kdbus_conn *conn) +{ + const struct kdbus_name_owner *owner; + struct kdbus_item *item; + size_t slen, size; + + lockdep_assert_held(&conn->ep->bus->name_registry->rwlock); + + size = 0; + /* open-code length calculation to avoid final padding */ + list_for_each_entry(owner, &conn->names_list, conn_entry) + if (!(owner->flags & KDBUS_NAME_IN_QUEUE)) + size = KDBUS_ALIGN8(size) + KDBUS_ITEM_HEADER_SIZE + + sizeof(struct kdbus_name) + + strlen(owner->name->name) + 1; + + if (!size) + return 0; + + /* make sure we include zeroed padding for convenience helpers */ + item = kmalloc(KDBUS_ALIGN8(size), GFP_KERNEL); + if (!item) + return -ENOMEM; + + mc->owned_names_items = item; + mc->owned_names_size = size; + + list_for_each_entry(owner, &conn->names_list, conn_entry) { + if (owner->flags & KDBUS_NAME_IN_QUEUE) + continue; + + slen = strlen(owner->name->name) + 1; + kdbus_item_set(item, KDBUS_ITEM_OWNED_NAME, NULL, + sizeof(struct kdbus_name) + slen); + item->name.flags = owner->flags; + memcpy(item->name.name, owner->name->name, slen); + item = KDBUS_ITEM_NEXT(item); + } + + /* sanity check: the buffer should be completely written now */ + WARN_ON((u8 *)item != + (u8 *)mc->owned_names_items + KDBUS_ALIGN8(size)); + + mc->valid |= KDBUS_ATTACH_NAMES; + return 0; +} + +static int kdbus_meta_conn_collect_description(struct kdbus_meta_conn *mc, + struct kdbus_conn *conn) +{ + if (!conn->description) + return 0; + + mc->conn_description = kstrdup(conn->description, GFP_KERNEL); + if (!mc->conn_description) + return -ENOMEM; + + mc->valid |= KDBUS_ATTACH_CONN_DESCRIPTION; + return 0; +} + +/** + * kdbus_meta_conn_collect() - Collect connection metadata + * @mc: Message metadata object + * @conn: Connection to collect data from + * @msg_seqnum: Sequence number of the message to send + * @what: Attach flags to collect + * + * This collects connection metadata from @msg_seqnum and @conn and saves it + * in @mc. + * + * If KDBUS_ATTACH_NAMES is set in @what and @conn is non-NULL, the caller must + * hold the name-registry read-lock of conn->ep->bus->registry. + * + * Return: 0 on success, negative error code on failure. 
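+ *
+ * For example, a caller that wants owned names attached is expected to wrap
+ * the call in the registry read-lock (sketch):
+ *
+ *      down_read(&conn->ep->bus->name_registry->rwlock);
+ *      ret = kdbus_meta_conn_collect(mc, conn, seqnum,
+ *                                    KDBUS_ATTACH_TIMESTAMP |
+ *                                    KDBUS_ATTACH_NAMES);
+ *      up_read(&conn->ep->bus->name_registry->rwlock);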
+ */ +int kdbus_meta_conn_collect(struct kdbus_meta_conn *mc, + struct kdbus_conn *conn, + u64 msg_seqnum, u64 what) +{ + int ret; + + if (!mc || !(what & (KDBUS_ATTACH_TIMESTAMP | + KDBUS_ATTACH_NAMES | + KDBUS_ATTACH_CONN_DESCRIPTION))) + return 0; + + mutex_lock(&mc->lock); + + if (msg_seqnum && (what & KDBUS_ATTACH_TIMESTAMP) && + !(mc->collected & KDBUS_ATTACH_TIMESTAMP)) { + kdbus_meta_conn_collect_timestamp(mc, msg_seqnum); + mc->collected |= KDBUS_ATTACH_TIMESTAMP; + } + + if (conn && (what & KDBUS_ATTACH_NAMES) && + !(mc->collected & KDBUS_ATTACH_NAMES)) { + ret = kdbus_meta_conn_collect_names(mc, conn); + if (ret < 0) + goto exit_unlock; + mc->collected |= KDBUS_ATTACH_NAMES; + } + + if (conn && (what & KDBUS_ATTACH_CONN_DESCRIPTION) && + !(mc->collected & KDBUS_ATTACH_CONN_DESCRIPTION)) { + ret = kdbus_meta_conn_collect_description(mc, conn); + if (ret < 0) + goto exit_unlock; + mc->collected |= KDBUS_ATTACH_CONN_DESCRIPTION; + } + + ret = 0; + +exit_unlock: + mutex_unlock(&mc->lock); + return ret; +} + +static void kdbus_meta_export_caps(struct kdbus_meta_caps *out, + const struct kdbus_meta_proc *mp, + struct user_namespace *user_ns) +{ + struct user_namespace *iter; + const struct cred *cred = mp->cred; + bool parent = false, owner = false; + int i; + + /* + * This translates the effective capabilities of 'cred' into the given + * user-namespace. If the given user-namespace is a child-namespace of + * the user-namespace of 'cred', the mask can be copied verbatim. If + * not, the mask is cleared. + * There's one exception: If 'cred' is the owner of any user-namespace + * in the path between the given user-namespace and the user-namespace + * of 'cred', then it has all effective capabilities set. This means, + * the user who created a user-namespace always has all effective + * capabilities in any child namespaces. Note that this is based on the + * uid of the namespace creator, not the task hierarchy. + */ + for (iter = user_ns; iter; iter = iter->parent) { + if (iter == cred->user_ns) { + parent = true; + break; + } + + if (iter == &init_user_ns) + break; + + if ((iter->parent == cred->user_ns) && + uid_eq(iter->owner, cred->euid)) { + owner = true; + break; + } + } + + out->last_cap = CAP_LAST_CAP; + + CAP_FOR_EACH_U32(i) { + if (parent) { + out->set[0].caps[i] = cred->cap_inheritable.cap[i]; + out->set[1].caps[i] = cred->cap_permitted.cap[i]; + out->set[2].caps[i] = cred->cap_effective.cap[i]; + out->set[3].caps[i] = cred->cap_bset.cap[i]; + } else if (owner) { + out->set[0].caps[i] = 0U; + out->set[1].caps[i] = ~0U; + out->set[2].caps[i] = ~0U; + out->set[3].caps[i] = ~0U; + } else { + out->set[0].caps[i] = 0U; + out->set[1].caps[i] = 0U; + out->set[2].caps[i] = 0U; + out->set[3].caps[i] = 0U; + } + } + + /* clear unused bits */ + for (i = 0; i < 4; i++) + out->set[i].caps[CAP_TO_INDEX(CAP_LAST_CAP)] &= + CAP_LAST_U32_VALID_MASK; +} + +/* This is equivalent to from_kuid_munged(), but maps INVALID_UID to itself */ +static uid_t kdbus_from_kuid_keep(struct user_namespace *ns, kuid_t uid) +{ + return uid_valid(uid) ? from_kuid_munged(ns, uid) : ((uid_t)-1); +} + +/* This is equivalent to from_kgid_munged(), but maps INVALID_GID to itself */ +static gid_t kdbus_from_kgid_keep(struct user_namespace *ns, kgid_t gid) +{ + return gid_valid(gid) ? 
from_kgid_munged(ns, gid) : ((gid_t)-1); +} + +struct kdbus_meta_staging { + const struct kdbus_meta_proc *mp; + const struct kdbus_meta_fake *mf; + const struct kdbus_meta_conn *mc; + const struct kdbus_conn *conn; + u64 mask; + + void *exe; + const char *exe_path; +}; + +static size_t kdbus_meta_measure(struct kdbus_meta_staging *staging) +{ + const struct kdbus_meta_proc *mp = staging->mp; + const struct kdbus_meta_fake *mf = staging->mf; + const struct kdbus_meta_conn *mc = staging->mc; + const u64 mask = staging->mask; + size_t size = 0; + + /* process metadata */ + + if (mf && (mask & KDBUS_ATTACH_CREDS)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_creds)); + else if (mp && (mask & KDBUS_ATTACH_CREDS)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_creds)); + + if (mf && (mask & KDBUS_ATTACH_PIDS)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_pids)); + else if (mp && (mask & KDBUS_ATTACH_PIDS)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_pids)); + + if (mp && (mask & KDBUS_ATTACH_AUXGROUPS)) + size += KDBUS_ITEM_SIZE(mp->cred->group_info->ngroups * + sizeof(u64)); + + if (mp && (mask & KDBUS_ATTACH_TID_COMM)) + size += KDBUS_ITEM_SIZE(strlen(mp->tid_comm) + 1); + + if (mp && (mask & KDBUS_ATTACH_PID_COMM)) + size += KDBUS_ITEM_SIZE(strlen(mp->pid_comm) + 1); + + if (staging->exe_path && (mask & KDBUS_ATTACH_EXE)) + size += KDBUS_ITEM_SIZE(strlen(staging->exe_path) + 1); + + if (mp && (mask & KDBUS_ATTACH_CMDLINE)) + size += KDBUS_ITEM_SIZE(strlen(mp->cmdline) + 1); + + if (mp && (mask & KDBUS_ATTACH_CGROUP)) + size += KDBUS_ITEM_SIZE(strlen(mp->cgroup) + 1); + + if (mp && (mask & KDBUS_ATTACH_CAPS)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_meta_caps)); + + if (mf && (mask & KDBUS_ATTACH_SECLABEL)) + size += KDBUS_ITEM_SIZE(strlen(mf->seclabel) + 1); + else if (mp && (mask & KDBUS_ATTACH_SECLABEL)) + size += KDBUS_ITEM_SIZE(strlen(mp->seclabel) + 1); + + if (mp && (mask & KDBUS_ATTACH_AUDIT)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_audit)); + + /* connection metadata */ + + if (mc && (mask & KDBUS_ATTACH_NAMES)) + size += KDBUS_ALIGN8(mc->owned_names_size); + + if (mc && (mask & KDBUS_ATTACH_CONN_DESCRIPTION)) + size += KDBUS_ITEM_SIZE(strlen(mc->conn_description) + 1); + + if (mc && (mask & KDBUS_ATTACH_TIMESTAMP)) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_timestamp)); + + return size; +} + +static struct kdbus_item *kdbus_write_head(struct kdbus_item **iter, + u64 type, u64 size) +{ + struct kdbus_item *item = *iter; + size_t padding; + + item->type = type; + item->size = KDBUS_ITEM_HEADER_SIZE + size; + + /* clear padding */ + padding = KDBUS_ALIGN8(item->size) - item->size; + if (padding) + memset(item->data + size, 0, padding); + + *iter = KDBUS_ITEM_NEXT(item); + return item; +} + +static struct kdbus_item *kdbus_write_full(struct kdbus_item **iter, + u64 type, u64 size, const void *data) +{ + struct kdbus_item *item; + + item = kdbus_write_head(iter, type, size); + memcpy(item->data, data, size); + return item; +} + +static size_t kdbus_meta_write(struct kdbus_meta_staging *staging, void *mem, + size_t size) +{ + struct user_namespace *user_ns = staging->conn->cred->user_ns; + struct pid_namespace *pid_ns = ns_of_pid(staging->conn->pid); + struct kdbus_item *item = NULL, *items = mem; + u8 *end, *owned_names_end = NULL; + + /* process metadata */ + + if (staging->mf && (staging->mask & KDBUS_ATTACH_CREDS)) { + const struct kdbus_meta_fake *mf = staging->mf; + + item = kdbus_write_head(&items, KDBUS_ITEM_CREDS, + sizeof(struct kdbus_creds)); + item->creds = 
(struct kdbus_creds){ + .uid = kdbus_from_kuid_keep(user_ns, mf->uid), + .euid = kdbus_from_kuid_keep(user_ns, mf->euid), + .suid = kdbus_from_kuid_keep(user_ns, mf->suid), + .fsuid = kdbus_from_kuid_keep(user_ns, mf->fsuid), + .gid = kdbus_from_kgid_keep(user_ns, mf->gid), + .egid = kdbus_from_kgid_keep(user_ns, mf->egid), + .sgid = kdbus_from_kgid_keep(user_ns, mf->sgid), + .fsgid = kdbus_from_kgid_keep(user_ns, mf->fsgid), + }; + } else if (staging->mp && (staging->mask & KDBUS_ATTACH_CREDS)) { + const struct cred *c = staging->mp->cred; + + item = kdbus_write_head(&items, KDBUS_ITEM_CREDS, + sizeof(struct kdbus_creds)); + item->creds = (struct kdbus_creds){ + .uid = kdbus_from_kuid_keep(user_ns, c->uid), + .euid = kdbus_from_kuid_keep(user_ns, c->euid), + .suid = kdbus_from_kuid_keep(user_ns, c->suid), + .fsuid = kdbus_from_kuid_keep(user_ns, c->fsuid), + .gid = kdbus_from_kgid_keep(user_ns, c->gid), + .egid = kdbus_from_kgid_keep(user_ns, c->egid), + .sgid = kdbus_from_kgid_keep(user_ns, c->sgid), + .fsgid = kdbus_from_kgid_keep(user_ns, c->fsgid), + }; + } + + if (staging->mf && (staging->mask & KDBUS_ATTACH_PIDS)) { + item = kdbus_write_head(&items, KDBUS_ITEM_PIDS, + sizeof(struct kdbus_pids)); + item->pids = (struct kdbus_pids){ + .pid = pid_nr_ns(staging->mf->tgid, pid_ns), + .tid = pid_nr_ns(staging->mf->pid, pid_ns), + .ppid = pid_nr_ns(staging->mf->ppid, pid_ns), + }; + } else if (staging->mp && (staging->mask & KDBUS_ATTACH_PIDS)) { + item = kdbus_write_head(&items, KDBUS_ITEM_PIDS, + sizeof(struct kdbus_pids)); + item->pids = (struct kdbus_pids){ + .pid = pid_nr_ns(staging->mp->tgid, pid_ns), + .tid = pid_nr_ns(staging->mp->pid, pid_ns), + .ppid = pid_nr_ns(staging->mp->ppid, pid_ns), + }; + } + + if (staging->mp && (staging->mask & KDBUS_ATTACH_AUXGROUPS)) { + const struct group_info *info = staging->mp->cred->group_info; + size_t i; + + item = kdbus_write_head(&items, KDBUS_ITEM_AUXGROUPS, + info->ngroups * sizeof(u64)); + for (i = 0; i < info->ngroups; ++i) + item->data64[i] = from_kgid_munged(user_ns, + GROUP_AT(info, i)); + } + + if (staging->mp && (staging->mask & KDBUS_ATTACH_TID_COMM)) + item = kdbus_write_full(&items, KDBUS_ITEM_TID_COMM, + strlen(staging->mp->tid_comm) + 1, + staging->mp->tid_comm); + + if (staging->mp && (staging->mask & KDBUS_ATTACH_PID_COMM)) + item = kdbus_write_full(&items, KDBUS_ITEM_PID_COMM, + strlen(staging->mp->pid_comm) + 1, + staging->mp->pid_comm); + + if (staging->exe_path && (staging->mask & KDBUS_ATTACH_EXE)) + item = kdbus_write_full(&items, KDBUS_ITEM_EXE, + strlen(staging->exe_path) + 1, + staging->exe_path); + + if (staging->mp && (staging->mask & KDBUS_ATTACH_CMDLINE)) + item = kdbus_write_full(&items, KDBUS_ITEM_CMDLINE, + strlen(staging->mp->cmdline) + 1, + staging->mp->cmdline); + + if (staging->mp && (staging->mask & KDBUS_ATTACH_CGROUP)) + item = kdbus_write_full(&items, KDBUS_ITEM_CGROUP, + strlen(staging->mp->cgroup) + 1, + staging->mp->cgroup); + + if (staging->mp && (staging->mask & KDBUS_ATTACH_CAPS)) { + item = kdbus_write_head(&items, KDBUS_ITEM_CAPS, + sizeof(struct kdbus_meta_caps)); + kdbus_meta_export_caps((void*)&item->caps, staging->mp, + user_ns); + } + + if (staging->mf && (staging->mask & KDBUS_ATTACH_SECLABEL)) + item = kdbus_write_full(&items, KDBUS_ITEM_SECLABEL, + strlen(staging->mf->seclabel) + 1, + staging->mf->seclabel); + else if (staging->mp && (staging->mask & KDBUS_ATTACH_SECLABEL)) + item = kdbus_write_full(&items, KDBUS_ITEM_SECLABEL, + strlen(staging->mp->seclabel) + 1, + 
staging->mp->seclabel); + + if (staging->mp && (staging->mask & KDBUS_ATTACH_AUDIT)) { + item = kdbus_write_head(&items, KDBUS_ITEM_AUDIT, + sizeof(struct kdbus_audit)); + item->audit = (struct kdbus_audit){ + .loginuid = from_kuid(user_ns, + staging->mp->audit_loginuid), + .sessionid = staging->mp->audit_sessionid, + }; + } + + /* connection metadata */ + + if (staging->mc && (staging->mask & KDBUS_ATTACH_NAMES)) { + memcpy(items, staging->mc->owned_names_items, + KDBUS_ALIGN8(staging->mc->owned_names_size)); + owned_names_end = (u8 *)items + staging->mc->owned_names_size; + items = (void *)KDBUS_ALIGN8((unsigned long)owned_names_end); + } + + if (staging->mc && (staging->mask & KDBUS_ATTACH_CONN_DESCRIPTION)) + item = kdbus_write_full(&items, KDBUS_ITEM_CONN_DESCRIPTION, + strlen(staging->mc->conn_description) + 1, + staging->mc->conn_description); + + if (staging->mc && (staging->mask & KDBUS_ATTACH_TIMESTAMP)) + item = kdbus_write_full(&items, KDBUS_ITEM_TIMESTAMP, + sizeof(staging->mc->ts), + &staging->mc->ts); + + /* + * Return real size (minus trailing padding). In case of 'owned_names' + * we cannot deduce it from item->size, so treat it special. + */ + + if (items == (void *)KDBUS_ALIGN8((unsigned long)owned_names_end)) + end = owned_names_end; + else if (item) + end = (u8 *)item + item->size; + else + end = mem; + + WARN_ON((u8 *)items - (u8 *)mem != size); + WARN_ON((void *)KDBUS_ALIGN8((unsigned long)end) != (void *)items); + + return end - (u8 *)mem; +} + +int kdbus_meta_emit(struct kdbus_meta_proc *mp, + struct kdbus_meta_fake *mf, + struct kdbus_meta_conn *mc, + struct kdbus_conn *conn, + u64 mask, + struct kdbus_item **out_items, + size_t *out_size) +{ + struct kdbus_meta_staging staging = {}; + struct kdbus_item *items = NULL; + size_t size = 0; + int ret; + + if (WARN_ON(mf && mp)) + mp = NULL; + + staging.mp = mp; + staging.mf = mf; + staging.mc = mc; + staging.conn = conn; + + /* get mask of valid items */ + if (mf) + staging.mask |= mf->valid; + if (mp) { + mutex_lock(&mp->lock); + staging.mask |= mp->valid; + mutex_unlock(&mp->lock); + } + if (mc) { + mutex_lock(&mc->lock); + staging.mask |= mc->valid; + mutex_unlock(&mc->lock); + } + + staging.mask &= mask; + + if (!staging.mask) { /* bail out if nothing to do */ + ret = 0; + goto exit; + } + + /* EXE is special as it needs a temporary page to assemble */ + if (mp && (staging.mask & KDBUS_ATTACH_EXE)) { + struct path p; + + /* + * XXX: We need access to __d_path() so we can write the path + * relative to conn->root_path. Once upstream, we need + * EXPORT_SYMBOL(__d_path) or an equivalent of d_path() that + * takes the root path directly. Until then, we drop this item + * if the root-paths differ. 
+ */ + + get_fs_root(current->fs, &p); + if (path_equal(&p, &conn->root_path)) { + staging.exe = (void *)__get_free_page(GFP_TEMPORARY); + if (!staging.exe) { + path_put(&p); + ret = -ENOMEM; + goto exit; + } + + staging.exe_path = d_path(&mp->exe_path, staging.exe, + PAGE_SIZE); + if (IS_ERR(staging.exe_path)) { + path_put(&p); + ret = PTR_ERR(staging.exe_path); + goto exit; + } + } + path_put(&p); + } + + size = kdbus_meta_measure(&staging); + if (!size) { /* bail out if nothing to do */ + ret = 0; + goto exit; + } + + items = kmalloc(size, GFP_KERNEL); + if (!items) { + ret = -ENOMEM; + goto exit; + } + + size = kdbus_meta_write(&staging, items, size); + if (!size) { + kfree(items); + items = NULL; + } + + ret = 0; + +exit: + if (staging.exe) + free_page((unsigned long)staging.exe); + if (ret >= 0) { + *out_items = items; + *out_size = size; + } + return ret; +} + +enum { + KDBUS_META_PROC_NONE, + KDBUS_META_PROC_NORMAL, +}; + +/** + * kdbus_proc_permission() - check /proc permissions on target pid + * @pid_ns: namespace we operate in + * @cred: credentials of requestor + * @target: target process + * + * This checks whether a process with credentials @cred can access information + * of @target in the namespace @pid_ns. This tries to follow /proc permissions, + * but is slightly more restrictive. + * + * Return: The /proc access level (KDBUS_META_PROC_*) is returned. + */ +static unsigned int kdbus_proc_permission(const struct pid_namespace *pid_ns, + const struct cred *cred, + struct pid *target) +{ + if (pid_ns->hide_pid < 1) + return KDBUS_META_PROC_NORMAL; + + /* XXX: we need groups_search() exported for aux-groups */ + if (gid_eq(cred->egid, pid_ns->pid_gid)) + return KDBUS_META_PROC_NORMAL; + + /* + * XXX: If ptrace_may_access(PTRACE_MODE_READ) is granted, you can + * overwrite hide_pid. However, ptrace_may_access() only supports + * checking 'current', hence, we cannot use this here. But we + * simply decide to not support this override, so no need to worry. + */ + + return KDBUS_META_PROC_NONE; +} + +/** + * kdbus_meta_proc_mask() - calculate which metadata would be visible to + * a connection via /proc + * @prv_pid: pid of metadata provider + * @req_pid: pid of metadata requestor + * @req_cred: credentials of metadata reqeuestor + * @wanted: metadata that is requested + * + * This checks which metadata items of @prv_pid can be read via /proc by the + * requestor @req_pid. + * + * Return: Set of metadata flags the requestor can see (limited by @wanted). + */ +static u64 kdbus_meta_proc_mask(struct pid *prv_pid, + struct pid *req_pid, + const struct cred *req_cred, + u64 wanted) +{ + struct pid_namespace *prv_ns, *req_ns; + unsigned int proc; + + prv_ns = ns_of_pid(prv_pid); + req_ns = ns_of_pid(req_pid); + + /* + * If the sender is not visible in the receiver namespace, then the + * receiver cannot access the sender via its own procfs. Hence, we do + * not attach any additional metadata. + */ + if (!pid_nr_ns(prv_pid, req_ns)) + return 0; + + /* + * If the pid-namespace of the receiver has hide_pid set, it cannot see + * any process but its own. We shortcut this /proc permission check if + * provider and requestor are the same. If not, we perform rather + * expensive /proc permission checks. 
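+ * For example, if the receiver's pid-namespace has hide_pid >= 1 and the
+ * requestor is neither the provider itself nor a member of the namespace's
+ * pid_gid group, all /proc-derived items (EXE, CMDLINE, CGROUP, CAPS,
+ * TID/PID_COMM, SECLABEL and AUDIT) are dropped from @wanted below.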
+ */ + if (prv_pid == req_pid) + proc = KDBUS_META_PROC_NORMAL; + else + proc = kdbus_proc_permission(req_ns, req_cred, prv_pid); + + /* you need /proc access to read standard process attributes */ + if (proc < KDBUS_META_PROC_NORMAL) + wanted &= ~(KDBUS_ATTACH_TID_COMM | + KDBUS_ATTACH_PID_COMM | + KDBUS_ATTACH_SECLABEL | + KDBUS_ATTACH_CMDLINE | + KDBUS_ATTACH_CGROUP | + KDBUS_ATTACH_AUDIT | + KDBUS_ATTACH_CAPS | + KDBUS_ATTACH_EXE); + + /* clear all non-/proc flags */ + return wanted & (KDBUS_ATTACH_TID_COMM | + KDBUS_ATTACH_PID_COMM | + KDBUS_ATTACH_SECLABEL | + KDBUS_ATTACH_CMDLINE | + KDBUS_ATTACH_CGROUP | + KDBUS_ATTACH_AUDIT | + KDBUS_ATTACH_CAPS | + KDBUS_ATTACH_EXE); +} + +/** + * kdbus_meta_get_mask() - calculate attach flags mask for metadata request + * @prv_pid: pid of metadata provider + * @prv_mask: mask of metadata the provide grants unchecked + * @req_pid: pid of metadata requestor + * @req_cred: credentials of metadata requestor + * @req_mask: mask of metadata that is requested + * + * This calculates the metadata items that the requestor @req_pid can access + * from the metadata provider @prv_pid. This permission check consists of + * several different parts: + * - Providers can grant metadata items unchecked. Regardless of their type, + * they're always granted to the requestor. This mask is passed as @prv_mask. + * - Basic items (credentials and connection metadata) are granted implicitly + * to everyone. They're publicly available to any bus-user that can see the + * provider. + * - Process credentials that are not granted implicitly follow the same + * permission checks as /proc. This means, we always assume a requestor + * process has access to their *own* /proc mount, if they have access to + * kdbusfs. + * + * Return: Mask of metadata that is granted. + */ +static u64 kdbus_meta_get_mask(struct pid *prv_pid, u64 prv_mask, + struct pid *req_pid, + const struct cred *req_cred, u64 req_mask) +{ + u64 missing, impl_mask, proc_mask = 0; + + /* + * Connection metadata and basic unix process credentials are + * transmitted implicitly, and cannot be suppressed. Both are required + * to perform user-space policies on the receiver-side. Furthermore, + * connection metadata is public state, anyway, and unix credentials + * are needed for UDS-compatibility. We extend them slightly by + * auxiliary groups and additional uids/gids/pids. + */ + impl_mask = /* connection metadata */ + KDBUS_ATTACH_CONN_DESCRIPTION | + KDBUS_ATTACH_TIMESTAMP | + KDBUS_ATTACH_NAMES | + /* credentials and pids */ + KDBUS_ATTACH_AUXGROUPS | + KDBUS_ATTACH_CREDS | + KDBUS_ATTACH_PIDS; + + /* + * Calculate the set of metadata that is not granted implicitly nor by + * the sender, but still requested by the receiver. If any are left, + * perform rather expensive /proc access checks for them. 
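+ * As a worked example: with prv_mask = KDBUS_ATTACH_CMDLINE, an impl_mask
+ * that includes KDBUS_ATTACH_CREDS, and req_mask = KDBUS_ATTACH_CREDS |
+ * KDBUS_ATTACH_CMDLINE | KDBUS_ATTACH_EXE, only KDBUS_ATTACH_EXE ends up
+ * in 'missing' and is subject to the /proc checks.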
+ */ + missing = req_mask & ~((prv_mask | impl_mask) & req_mask); + if (missing) + proc_mask = kdbus_meta_proc_mask(prv_pid, req_pid, req_cred, + missing); + + return (prv_mask | impl_mask | proc_mask) & req_mask; +} + +/** + */ +u64 kdbus_meta_info_mask(const struct kdbus_conn *conn, u64 mask) +{ + return kdbus_meta_get_mask(conn->pid, + atomic64_read(&conn->attach_flags_send), + task_pid(current), + current_cred(), + mask); +} + +/** + */ +u64 kdbus_meta_msg_mask(const struct kdbus_conn *snd, + const struct kdbus_conn *rcv) +{ + return kdbus_meta_get_mask(task_pid(current), + atomic64_read(&snd->attach_flags_send), + rcv->pid, + rcv->cred, + atomic64_read(&rcv->attach_flags_recv)); +} diff -Nur linux-4.2/ipc/kdbus/metadata.h linux-4.2-pck/ipc/kdbus/metadata.h --- linux-4.2/ipc/kdbus/metadata.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/metadata.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_METADATA_H +#define __KDBUS_METADATA_H + +#include + +struct kdbus_conn; +struct kdbus_pool_slice; + +struct kdbus_meta_proc; +struct kdbus_meta_conn; + +/** + * struct kdbus_meta_fake - Fake metadata + * @valid: Bitmask of collected and valid items + * @uid: UID of process + * @euid: EUID of process + * @suid: SUID of process + * @fsuid: FSUID of process + * @gid: GID of process + * @egid: EGID of process + * @sgid: SGID of process + * @fsgid: FSGID of process + * @pid: PID of process + * @tgid: TGID of process + * @ppid: PPID of process + * @seclabel: Seclabel + */ +struct kdbus_meta_fake { + u64 valid; + + /* KDBUS_ITEM_CREDS */ + kuid_t uid, euid, suid, fsuid; + kgid_t gid, egid, sgid, fsgid; + + /* KDBUS_ITEM_PIDS */ + struct pid *pid, *tgid, *ppid; + + /* KDBUS_ITEM_SECLABEL */ + char *seclabel; +}; + +struct kdbus_meta_proc *kdbus_meta_proc_new(void); +struct kdbus_meta_proc *kdbus_meta_proc_ref(struct kdbus_meta_proc *mp); +struct kdbus_meta_proc *kdbus_meta_proc_unref(struct kdbus_meta_proc *mp); +int kdbus_meta_proc_collect(struct kdbus_meta_proc *mp, u64 what); + +struct kdbus_meta_fake *kdbus_meta_fake_new(void); +struct kdbus_meta_fake *kdbus_meta_fake_free(struct kdbus_meta_fake *mf); +int kdbus_meta_fake_collect(struct kdbus_meta_fake *mf, + const struct kdbus_creds *creds, + const struct kdbus_pids *pids, + const char *seclabel); + +struct kdbus_meta_conn *kdbus_meta_conn_new(void); +struct kdbus_meta_conn *kdbus_meta_conn_ref(struct kdbus_meta_conn *mc); +struct kdbus_meta_conn *kdbus_meta_conn_unref(struct kdbus_meta_conn *mc); +int kdbus_meta_conn_collect(struct kdbus_meta_conn *mc, + struct kdbus_conn *conn, + u64 msg_seqnum, u64 what); + +int kdbus_meta_emit(struct kdbus_meta_proc *mp, + struct kdbus_meta_fake *mf, + struct kdbus_meta_conn *mc, + struct kdbus_conn *conn, + u64 mask, + struct kdbus_item **out_items, + size_t *out_size); +u64 kdbus_meta_info_mask(const struct kdbus_conn *conn, u64 mask); +u64 kdbus_meta_msg_mask(const struct kdbus_conn *snd, + const struct kdbus_conn *rcv); + +#endif diff -Nur 
linux-4.2/ipc/kdbus/names.c linux-4.2-pck/ipc/kdbus/names.c --- linux-4.2/ipc/kdbus/names.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/names.c 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,854 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "handle.h" +#include "item.h" +#include "names.h" +#include "notify.h" +#include "policy.h" + +#define KDBUS_NAME_SAVED_MASK (KDBUS_NAME_ALLOW_REPLACEMENT | \ + KDBUS_NAME_QUEUE) + +static bool kdbus_name_owner_is_used(struct kdbus_name_owner *owner) +{ + return !list_empty(&owner->name_entry) || + owner == owner->name->activator; +} + +static struct kdbus_name_owner * +kdbus_name_owner_new(struct kdbus_conn *conn, struct kdbus_name_entry *name, + u64 flags) +{ + struct kdbus_name_owner *owner; + + kdbus_conn_assert_active(conn); + + if (conn->name_count >= KDBUS_CONN_MAX_NAMES) + return ERR_PTR(-E2BIG); + + owner = kmalloc(sizeof(*owner), GFP_KERNEL); + if (!owner) + return ERR_PTR(-ENOMEM); + + owner->flags = flags & KDBUS_NAME_SAVED_MASK; + owner->conn = conn; + owner->name = name; + list_add_tail(&owner->conn_entry, &conn->names_list); + INIT_LIST_HEAD(&owner->name_entry); + + ++conn->name_count; + return owner; +} + +static void kdbus_name_owner_free(struct kdbus_name_owner *owner) +{ + if (!owner) + return; + + WARN_ON(kdbus_name_owner_is_used(owner)); + --owner->conn->name_count; + list_del(&owner->conn_entry); + kfree(owner); +} + +static struct kdbus_name_owner * +kdbus_name_owner_find(struct kdbus_name_entry *name, struct kdbus_conn *conn) +{ + struct kdbus_name_owner *owner; + + /* + * Use conn->names_list over name->queue to make sure boundaries of + * this linear search are controlled by the connection itself. + * Furthermore, this will find normal owners as well as activators + * without any additional code. 
+ */ + list_for_each_entry(owner, &conn->names_list, conn_entry) + if (owner->name == name) + return owner; + + return NULL; +} + +static bool kdbus_name_entry_is_used(struct kdbus_name_entry *name) +{ + return !list_empty(&name->queue) || name->activator; +} + +static struct kdbus_name_owner * +kdbus_name_entry_first(struct kdbus_name_entry *name) +{ + return list_first_entry_or_null(&name->queue, struct kdbus_name_owner, + name_entry); +} + +static struct kdbus_name_entry * +kdbus_name_entry_new(struct kdbus_name_registry *r, u32 hash, + const char *name_str) +{ + struct kdbus_name_entry *name; + size_t namelen; + + lockdep_assert_held(&r->rwlock); + + namelen = strlen(name_str); + + name = kmalloc(sizeof(*name) + namelen + 1, GFP_KERNEL); + if (!name) + return ERR_PTR(-ENOMEM); + + name->name_id = ++r->name_seq_last; + name->activator = NULL; + INIT_LIST_HEAD(&name->queue); + hash_add(r->entries_hash, &name->hentry, hash); + memcpy(name->name, name_str, namelen + 1); + + return name; +} + +static void kdbus_name_entry_free(struct kdbus_name_entry *name) +{ + if (!name) + return; + + WARN_ON(kdbus_name_entry_is_used(name)); + hash_del(&name->hentry); + kfree(name); +} + +static struct kdbus_name_entry * +kdbus_name_entry_find(struct kdbus_name_registry *r, u32 hash, + const char *name_str) +{ + struct kdbus_name_entry *name; + + lockdep_assert_held(&r->rwlock); + + hash_for_each_possible(r->entries_hash, name, hentry, hash) + if (!strcmp(name->name, name_str)) + return name; + + return NULL; +} + +/** + * kdbus_name_registry_new() - create a new name registry + * + * Return: a new kdbus_name_registry on success, ERR_PTR on failure. + */ +struct kdbus_name_registry *kdbus_name_registry_new(void) +{ + struct kdbus_name_registry *r; + + r = kmalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return ERR_PTR(-ENOMEM); + + hash_init(r->entries_hash); + init_rwsem(&r->rwlock); + r->name_seq_last = 0; + + return r; +} + +/** + * kdbus_name_registry_free() - free name registry + * @r: name registry to free, or NULL + * + * Free a name registry and cleanup all internal objects. This is a no-op if + * you pass NULL as registry. + */ +void kdbus_name_registry_free(struct kdbus_name_registry *r) +{ + if (!r) + return; + + WARN_ON(!hash_empty(r->entries_hash)); + kfree(r); +} + +/** + * kdbus_name_lookup_unlocked() - lookup name in registry + * @reg: name registry + * @name: name to lookup + * + * This looks up @name in the given name-registry and returns the + * kdbus_name_entry object. The caller must hold the registry-lock and must not + * access the returned object after releasing the lock. + * + * Return: Pointer to name-entry, or NULL if not found. 
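+ *
+ * A minimal sketch of the expected calling pattern (the name string is only
+ * an example):
+ *
+ *      down_read(&reg->rwlock);
+ *      entry = kdbus_name_lookup_unlocked(reg, "com.example.Service");
+ *      if (entry)
+ *              owner = kdbus_name_get_owner(entry);
+ *      up_read(&reg->rwlock);
+ *
+ * Neither the entry nor its owner may be dereferenced once the lock has been
+ * dropped.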
+ */ +struct kdbus_name_entry * +kdbus_name_lookup_unlocked(struct kdbus_name_registry *reg, const char *name) +{ + return kdbus_name_entry_find(reg, kdbus_strhash(name), name); +} + +static int kdbus_name_become_activator(struct kdbus_name_owner *owner, + u64 *return_flags) +{ + if (kdbus_name_owner_is_used(owner)) + return -EALREADY; + if (owner->name->activator) + return -EEXIST; + + owner->name->activator = owner; + owner->flags |= KDBUS_NAME_ACTIVATOR; + + if (kdbus_name_entry_first(owner->name)) { + owner->flags |= KDBUS_NAME_IN_QUEUE; + } else { + owner->flags |= KDBUS_NAME_PRIMARY; + kdbus_notify_name_change(owner->conn->ep->bus, + KDBUS_ITEM_NAME_ADD, + 0, owner->conn->id, + 0, owner->flags, + owner->name->name); + } + + if (return_flags) + *return_flags = owner->flags | KDBUS_NAME_ACQUIRED; + + return 0; +} + +static int kdbus_name_update(struct kdbus_name_owner *owner, u64 flags, + u64 *return_flags) +{ + struct kdbus_name_owner *primary, *activator; + struct kdbus_name_entry *name; + struct kdbus_bus *bus; + u64 nflags = 0; + int ret = 0; + + name = owner->name; + bus = owner->conn->ep->bus; + primary = kdbus_name_entry_first(name); + activator = name->activator; + + /* cannot be activator and acquire a name */ + if (owner == activator) + return -EUCLEAN; + + /* update saved flags */ + owner->flags = flags & KDBUS_NAME_SAVED_MASK; + + if (!primary) { + /* + * No primary owner (but maybe an activator). Take over the + * name. + */ + + list_add(&owner->name_entry, &name->queue); + owner->flags |= KDBUS_NAME_PRIMARY; + nflags |= KDBUS_NAME_ACQUIRED; + + /* move messages to new owner on activation */ + if (activator) { + kdbus_conn_move_messages(owner->conn, activator->conn, + name->name_id); + kdbus_notify_name_change(bus, KDBUS_ITEM_NAME_CHANGE, + activator->conn->id, owner->conn->id, + activator->flags, owner->flags, + name->name); + activator->flags &= ~KDBUS_NAME_PRIMARY; + activator->flags |= KDBUS_NAME_IN_QUEUE; + } else { + kdbus_notify_name_change(bus, KDBUS_ITEM_NAME_ADD, + 0, owner->conn->id, + 0, owner->flags, + name->name); + } + + } else if (owner == primary) { + /* + * Already the primary owner of the name, flags were already + * updated. Nothing to do. + */ + + owner->flags |= KDBUS_NAME_PRIMARY; + + } else if ((primary->flags & KDBUS_NAME_ALLOW_REPLACEMENT) && + (flags & KDBUS_NAME_REPLACE_EXISTING)) { + /* + * We're not the primary owner but can replace it. Move us + * ahead of the primary owner and acquire the name (possibly + * skipping queued owners ahead of us). + */ + + list_del_init(&owner->name_entry); + list_add(&owner->name_entry, &name->queue); + owner->flags |= KDBUS_NAME_PRIMARY; + nflags |= KDBUS_NAME_ACQUIRED; + + kdbus_notify_name_change(bus, KDBUS_ITEM_NAME_CHANGE, + primary->conn->id, owner->conn->id, + primary->flags, owner->flags, + name->name); + + /* requeue old primary, or drop if queueing not wanted */ + if (primary->flags & KDBUS_NAME_QUEUE) { + primary->flags &= ~KDBUS_NAME_PRIMARY; + primary->flags |= KDBUS_NAME_IN_QUEUE; + } else { + list_del_init(&primary->name_entry); + kdbus_name_owner_free(primary); + } + + } else if (flags & KDBUS_NAME_QUEUE) { + /* + * Name is already occupied and we cannot take it over, but + * queuing is allowed. Put us silently on the queue, if not + * already there. 
+ */ + + owner->flags |= KDBUS_NAME_IN_QUEUE; + if (!kdbus_name_owner_is_used(owner)) { + list_add_tail(&owner->name_entry, &name->queue); + nflags |= KDBUS_NAME_ACQUIRED; + } + } else if (kdbus_name_owner_is_used(owner)) { + /* + * Already queued on name, but re-queueing was not requested. + * Make sure to unlink it from the name, the caller is + * responsible for releasing it. + */ + + list_del_init(&owner->name_entry); + } else { + /* + * Name is already claimed and queueing is not requested. + * Return error to the caller. + */ + + ret = -EEXIST; + } + + if (return_flags) + *return_flags = owner->flags | nflags; + + return ret; +} + +int kdbus_name_acquire(struct kdbus_name_registry *reg, + struct kdbus_conn *conn, const char *name_str, + u64 flags, u64 *return_flags) +{ + struct kdbus_name_entry *name = NULL; + struct kdbus_name_owner *owner = NULL; + u32 hash; + int ret; + + kdbus_conn_assert_active(conn); + + down_write(®->rwlock); + + /* + * Verify the connection has access to the name. Do this before testing + * for double-acquisitions and other errors to make sure we do not leak + * information about this name through possible custom endpoints. + */ + if (!kdbus_conn_policy_own_name(conn, current_cred(), name_str)) { + ret = -EPERM; + goto exit; + } + + /* + * Lookup the name entry. If it already exists, search for an owner + * entry as we might already own that name. If either does not exist, + * we will allocate a fresh one. + */ + hash = kdbus_strhash(name_str); + name = kdbus_name_entry_find(reg, hash, name_str); + if (name) { + owner = kdbus_name_owner_find(name, conn); + } else { + name = kdbus_name_entry_new(reg, hash, name_str); + if (IS_ERR(name)) { + ret = PTR_ERR(name); + name = NULL; + goto exit; + } + } + + /* create name owner object if not already queued */ + if (!owner) { + owner = kdbus_name_owner_new(conn, name, flags); + if (IS_ERR(owner)) { + ret = PTR_ERR(owner); + owner = NULL; + goto exit; + } + } + + if (flags & KDBUS_NAME_ACTIVATOR) + ret = kdbus_name_become_activator(owner, return_flags); + else + ret = kdbus_name_update(owner, flags, return_flags); + if (ret < 0) + goto exit; + +exit: + if (owner && !kdbus_name_owner_is_used(owner)) + kdbus_name_owner_free(owner); + if (name && !kdbus_name_entry_is_used(name)) + kdbus_name_entry_free(name); + up_write(®->rwlock); + kdbus_notify_flush(conn->ep->bus); + return ret; +} + +static void kdbus_name_release_unlocked(struct kdbus_name_owner *owner) +{ + struct kdbus_name_owner *primary, *next; + struct kdbus_name_entry *name; + + name = owner->name; + primary = kdbus_name_entry_first(name); + + list_del_init(&owner->name_entry); + if (owner == name->activator) + name->activator = NULL; + + if (!primary || owner == primary) { + next = kdbus_name_entry_first(name); + if (!next) + next = name->activator; + + if (next) { + /* hand to next in queue */ + next->flags &= ~KDBUS_NAME_IN_QUEUE; + next->flags |= KDBUS_NAME_PRIMARY; + if (next == name->activator) + kdbus_conn_move_messages(next->conn, + owner->conn, + name->name_id); + + kdbus_notify_name_change(owner->conn->ep->bus, + KDBUS_ITEM_NAME_CHANGE, + owner->conn->id, next->conn->id, + owner->flags, next->flags, + name->name); + } else { + kdbus_notify_name_change(owner->conn->ep->bus, + KDBUS_ITEM_NAME_REMOVE, + owner->conn->id, 0, + owner->flags, 0, + name->name); + } + } + + kdbus_name_owner_free(owner); + if (!kdbus_name_entry_is_used(name)) + kdbus_name_entry_free(name); +} + +static int kdbus_name_release(struct kdbus_name_registry *reg, + struct kdbus_conn 
*conn, + const char *name_str) +{ + struct kdbus_name_owner *owner; + struct kdbus_name_entry *name; + int ret = 0; + + down_write(®->rwlock); + name = kdbus_name_entry_find(reg, kdbus_strhash(name_str), name_str); + if (name) { + owner = kdbus_name_owner_find(name, conn); + if (owner) + kdbus_name_release_unlocked(owner); + else + ret = -EADDRINUSE; + } else { + ret = -ESRCH; + } + up_write(®->rwlock); + + kdbus_notify_flush(conn->ep->bus); + return ret; +} + +/** + * kdbus_name_release_all() - remove all name entries of a given connection + * @reg: name registry + * @conn: connection + */ +void kdbus_name_release_all(struct kdbus_name_registry *reg, + struct kdbus_conn *conn) +{ + struct kdbus_name_owner *owner; + + down_write(®->rwlock); + + while ((owner = list_first_entry_or_null(&conn->names_list, + struct kdbus_name_owner, + conn_entry))) + kdbus_name_release_unlocked(owner); + + up_write(®->rwlock); + + kdbus_notify_flush(conn->ep->bus); +} + +/** + * kdbus_name_is_valid() - check if a name is valid + * @p: The name to check + * @allow_wildcard: Whether or not to allow a wildcard name + * + * A name is valid if all of the following criterias are met: + * + * - The name has two or more elements separated by a period ('.') character. + * - All elements must contain at least one character. + * - Each element must only contain the ASCII characters "[A-Z][a-z][0-9]_-" + * and must not begin with a digit. + * - The name must not exceed KDBUS_NAME_MAX_LEN. + * - If @allow_wildcard is true, the name may end on '.*' + */ +bool kdbus_name_is_valid(const char *p, bool allow_wildcard) +{ + bool dot, found_dot = false; + const char *q; + + for (dot = true, q = p; *q; q++) { + if (*q == '.') { + if (dot) + return false; + + found_dot = true; + dot = true; + } else { + bool good; + + good = isalpha(*q) || (!dot && isdigit(*q)) || + *q == '_' || *q == '-' || + (allow_wildcard && dot && + *q == '*' && *(q + 1) == '\0'); + + if (!good) + return false; + + dot = false; + } + } + + if (q - p > KDBUS_NAME_MAX_LEN) + return false; + + if (dot) + return false; + + if (!found_dot) + return false; + + return true; +} + +/** + * kdbus_cmd_name_acquire() - handle KDBUS_CMD_NAME_ACQUIRE + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. + */ +int kdbus_cmd_name_acquire(struct kdbus_conn *conn, void __user *argp) +{ + const char *item_name; + struct kdbus_cmd *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_NAME, .mandatory = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_NAME_REPLACE_EXISTING | + KDBUS_NAME_ALLOW_REPLACEMENT | + KDBUS_NAME_QUEUE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + item_name = argv[1].item->str; + if (!kdbus_name_is_valid(item_name, false)) { + ret = -EINVAL; + goto exit; + } + + ret = kdbus_name_acquire(conn->ep->bus->name_registry, conn, item_name, + cmd->flags, &cmd->return_flags); + +exit: + return kdbus_args_clear(&args, ret); +} + +/** + * kdbus_cmd_name_release() - handle KDBUS_CMD_NAME_RELEASE + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. 
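+ *
+ * A rough sketch of the user-space payload this handler expects, assuming
+ * the kdbus uapi definitions and an open connection file descriptor 'fd'
+ * (error handling omitted, the name is only an example):
+ *
+ *      __u64 buf[16] = { 0 };
+ *      struct kdbus_cmd *cmd = (struct kdbus_cmd *)buf;
+ *      struct kdbus_item *item = cmd->items;
+ *      const char *name = "com.example.Service";
+ *
+ *      item->type = KDBUS_ITEM_NAME;
+ *      item->size = offsetof(struct kdbus_item, str) + strlen(name) + 1;
+ *      strcpy(item->str, name);
+ *      cmd->size = sizeof(*cmd) + ((item->size + 7) & ~7ULL);
+ *
+ *      ioctl(fd, KDBUS_CMD_NAME_RELEASE, cmd);
+ *
+ * Releasing a name that is not known on the bus fails with -ESRCH; releasing
+ * a name that is neither owned nor queued by this connection fails with
+ * -EADDRINUSE.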
+ */ +int kdbus_cmd_name_release(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_cmd *cmd; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + { .type = KDBUS_ITEM_NAME, .mandatory = true }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + if (!kdbus_conn_is_ordinary(conn)) + return -EOPNOTSUPP; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + ret = kdbus_name_release(conn->ep->bus->name_registry, conn, + argv[1].item->str); + return kdbus_args_clear(&args, ret); +} + +static int kdbus_list_write(struct kdbus_conn *conn, + struct kdbus_conn *c, + struct kdbus_pool_slice *slice, + size_t *pos, + struct kdbus_name_owner *o, + bool write) +{ + struct kvec kvec[4]; + size_t cnt = 0; + int ret; + + /* info header */ + struct kdbus_info info = { + .size = 0, + .id = c->id, + .flags = c->flags, + }; + + /* fake the header of a kdbus_name item */ + struct { + u64 size; + u64 type; + u64 flags; + } h = {}; + + if (o && !kdbus_conn_policy_see_name_unlocked(conn, current_cred(), + o->name->name)) + return 0; + + kdbus_kvec_set(&kvec[cnt++], &info, sizeof(info), &info.size); + + /* append name */ + if (o) { + size_t slen = strlen(o->name->name) + 1; + + h.size = offsetof(struct kdbus_item, name.name) + slen; + h.type = KDBUS_ITEM_OWNED_NAME; + h.flags = o->flags; + + kdbus_kvec_set(&kvec[cnt++], &h, sizeof(h), &info.size); + kdbus_kvec_set(&kvec[cnt++], o->name->name, slen, &info.size); + cnt += !!kdbus_kvec_pad(&kvec[cnt], &info.size); + } + + if (write) { + ret = kdbus_pool_slice_copy_kvec(slice, *pos, kvec, + cnt, info.size); + if (ret < 0) + return ret; + } + + *pos += info.size; + return 0; +} + +static int kdbus_list_all(struct kdbus_conn *conn, u64 flags, + struct kdbus_pool_slice *slice, + size_t *pos, bool write) +{ + struct kdbus_conn *c; + size_t p = *pos; + int ret, i; + + hash_for_each(conn->ep->bus->conn_hash, i, c, hentry) { + bool added = false; + + /* skip monitors */ + if (kdbus_conn_is_monitor(c)) + continue; + + /* all names the connection owns */ + if (flags & (KDBUS_LIST_NAMES | + KDBUS_LIST_ACTIVATORS | + KDBUS_LIST_QUEUED)) { + struct kdbus_name_owner *o; + + list_for_each_entry(o, &c->names_list, conn_entry) { + if (o->flags & KDBUS_NAME_ACTIVATOR) { + if (!(flags & KDBUS_LIST_ACTIVATORS)) + continue; + + ret = kdbus_list_write(conn, c, slice, + &p, o, write); + if (ret < 0) { + mutex_unlock(&c->lock); + return ret; + } + + added = true; + } else if (o->flags & KDBUS_NAME_IN_QUEUE) { + if (!(flags & KDBUS_LIST_QUEUED)) + continue; + + ret = kdbus_list_write(conn, c, slice, + &p, o, write); + if (ret < 0) { + mutex_unlock(&c->lock); + return ret; + } + + added = true; + } else if (flags & KDBUS_LIST_NAMES) { + ret = kdbus_list_write(conn, c, slice, + &p, o, write); + if (ret < 0) { + mutex_unlock(&c->lock); + return ret; + } + + added = true; + } + } + } + + /* nothing added so far, just add the unique ID */ + if (!added && (flags & KDBUS_LIST_UNIQUE)) { + ret = kdbus_list_write(conn, c, slice, &p, NULL, write); + if (ret < 0) + return ret; + } + } + + *pos = p; + return 0; +} + +/** + * kdbus_cmd_list() - handle KDBUS_CMD_LIST + * @conn: connection to operate on + * @argp: command payload + * + * Return: >=0 on success, negative error code on failure. 
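+ *
+ * A rough user-space sketch, assuming 'fd' is the connection's file
+ * descriptor and 'pool' its mmap()ed pool (error handling omitted):
+ *
+ *      struct kdbus_cmd_list cmd = {
+ *              .size = sizeof(cmd),
+ *              .flags = KDBUS_LIST_UNIQUE | KDBUS_LIST_NAMES,
+ *      };
+ *
+ *      ioctl(fd, KDBUS_CMD_LIST, &cmd);
+ *
+ * The records are then available at pool + cmd.offset, spanning
+ * cmd.list_size bytes, and the slice should be returned to the pool with
+ * KDBUS_CMD_FREE once it has been parsed.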
+ */ +int kdbus_cmd_list(struct kdbus_conn *conn, void __user *argp) +{ + struct kdbus_name_registry *reg = conn->ep->bus->name_registry; + struct kdbus_pool_slice *slice = NULL; + struct kdbus_cmd_list *cmd; + size_t pos, size; + int ret; + + struct kdbus_arg argv[] = { + { .type = KDBUS_ITEM_NEGOTIATE }, + }; + struct kdbus_args args = { + .allowed_flags = KDBUS_FLAG_NEGOTIATE | + KDBUS_LIST_UNIQUE | + KDBUS_LIST_NAMES | + KDBUS_LIST_ACTIVATORS | + KDBUS_LIST_QUEUED, + .argv = argv, + .argc = ARRAY_SIZE(argv), + }; + + ret = kdbus_args_parse(&args, argp, &cmd); + if (ret != 0) + return ret; + + /* lock order: domain -> bus -> ep -> names -> conn */ + down_read(®->rwlock); + down_read(&conn->ep->bus->conn_rwlock); + down_read(&conn->ep->policy_db.entries_rwlock); + + /* size of records */ + size = 0; + ret = kdbus_list_all(conn, cmd->flags, NULL, &size, false); + if (ret < 0) + goto exit_unlock; + + if (size == 0) { + kdbus_pool_publish_empty(conn->pool, &cmd->offset, + &cmd->list_size); + } else { + slice = kdbus_pool_slice_alloc(conn->pool, size, false); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto exit_unlock; + } + + /* copy the records */ + pos = 0; + ret = kdbus_list_all(conn, cmd->flags, slice, &pos, true); + if (ret < 0) + goto exit_unlock; + + WARN_ON(pos != size); + kdbus_pool_slice_publish(slice, &cmd->offset, &cmd->list_size); + } + + if (kdbus_member_set_user(&cmd->offset, argp, typeof(*cmd), offset) || + kdbus_member_set_user(&cmd->list_size, argp, + typeof(*cmd), list_size)) + ret = -EFAULT; + +exit_unlock: + up_read(&conn->ep->policy_db.entries_rwlock); + up_read(&conn->ep->bus->conn_rwlock); + up_read(®->rwlock); + kdbus_pool_slice_release(slice); + return kdbus_args_clear(&args, ret); +} diff -Nur linux-4.2/ipc/kdbus/names.h linux-4.2-pck/ipc/kdbus/names.h --- linux-4.2/ipc/kdbus/names.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/names.h 2015-09-08 00:48:22.235030231 -0300 @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef __KDBUS_NAMES_H +#define __KDBUS_NAMES_H + +#include +#include + +struct kdbus_name_entry; +struct kdbus_name_owner; +struct kdbus_name_registry; + +/** + * struct kdbus_name_registry - names registered for a bus + * @entries_hash: Map of entries + * @lock: Registry data lock + * @name_seq_last: Last used sequence number to assign to a name entry + */ +struct kdbus_name_registry { + DECLARE_HASHTABLE(entries_hash, 8); + struct rw_semaphore rwlock; + u64 name_seq_last; +}; + +/** + * struct kdbus_name_entry - well-know name entry + * @name_id: sequence number of name entry to be able to uniquely + * identify a name over its registration lifetime + * @activator: activator of this name, or NULL + * @queue: list of queued owners + * @hentry: entry in registry map + * @name: well-known name + */ +struct kdbus_name_entry { + u64 name_id; + struct kdbus_name_owner *activator; + struct list_head queue; + struct hlist_node hentry; + char name[]; +}; + +/** + * struct kdbus_name_owner - owner of a well-known name + * @flags: KDBUS_NAME_* flags of this owner + * @conn: connection owning the name + * @name: name that is owned + * @conn_entry: link into @conn + * @name_entry: link into @name + */ +struct kdbus_name_owner { + u64 flags; + struct kdbus_conn *conn; + struct kdbus_name_entry *name; + struct list_head conn_entry; + struct list_head name_entry; +}; + +bool kdbus_name_is_valid(const char *p, bool allow_wildcard); + +struct kdbus_name_registry *kdbus_name_registry_new(void); +void kdbus_name_registry_free(struct kdbus_name_registry *reg); + +struct kdbus_name_entry * +kdbus_name_lookup_unlocked(struct kdbus_name_registry *reg, const char *name); + +int kdbus_name_acquire(struct kdbus_name_registry *reg, + struct kdbus_conn *conn, const char *name, + u64 flags, u64 *return_flags); +void kdbus_name_release_all(struct kdbus_name_registry *reg, + struct kdbus_conn *conn); + +int kdbus_cmd_name_acquire(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_name_release(struct kdbus_conn *conn, void __user *argp); +int kdbus_cmd_list(struct kdbus_conn *conn, void __user *argp); + +/** + * kdbus_name_get_owner() - get current owner of a name + * @name: name to get current owner of + * + * This returns a pointer to the current owner of a name (or its activator if + * there is no owner). The caller must make sure @name is valid and does not + * vanish. + * + * Return: Pointer to current owner or NULL if there is none. + */ +static inline struct kdbus_name_owner * +kdbus_name_get_owner(struct kdbus_name_entry *name) +{ + return list_first_entry_or_null(&name->queue, struct kdbus_name_owner, + name_entry) ? : name->activator; +} + +#endif diff -Nur linux-4.2/ipc/kdbus/node.c linux-4.2-pck/ipc/kdbus/node.c --- linux-4.2/ipc/kdbus/node.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/node.c 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,948 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "domain.h" +#include "endpoint.h" +#include "fs.h" +#include "handle.h" +#include "node.h" +#include "util.h" + +/** + * DOC: kdbus nodes + * + * Nodes unify lifetime management across exposed kdbus objects and provide a + * hierarchy. Each kdbus object, that might be exposed to user-space, has a + * kdbus_node object embedded and is linked into the hierarchy. Each node can + * have any number (0-n) of child nodes linked. Each child retains a reference + * to its parent node. For root-nodes, the parent is NULL. + * + * Each node object goes through a bunch of states during it's lifetime: + * * NEW + * * LINKED (can be skipped by NEW->FREED transition) + * * ACTIVE (can be skipped by LINKED->INACTIVE transition) + * * INACTIVE + * * DRAINED + * * FREED + * + * Each node is allocated by the caller and initialized via kdbus_node_init(). + * This never fails and sets the object into state NEW. From now on, ref-counts + * on the node manage its lifetime. During init, the ref-count is set to 1. Once + * it drops to 0, the node goes to state FREED and the node->free_cb() callback + * is called to deallocate any memory. + * + * After initializing a node, you usually link it into the hierarchy. You need + * to provide a parent node and a name. The node will be linked as child to the + * parent and a globally unique ID is assigned to the child. The name of the + * child must be unique for all children of this parent. Otherwise, linking the + * child will fail with -EEXIST. + * Note that the child is not marked active, yet. Admittedly, it prevents any + * other node from being linked with the same name (thus, it reserves that + * name), but any child-lookup (via name or unique ID) will never return this + * child unless it has been marked active. + * + * Once successfully linked, you can use kdbus_node_activate() to activate a + * child. This will mark the child active. This state can be skipped by directly + * deactivating the child via kdbus_node_deactivate() (see below). + * By activating a child, you enable any lookups on this child to succeed from + * now on. Furthermore, any code that got its hands on a reference to the node, + * can from now on "acquire" the node. + * + * Active References (or: 'acquiring' and 'releasing' a node) + * Additionally to normal object references, nodes support something we call + * "active references". An active reference can be acquired via + * kdbus_node_acquire() and released via kdbus_node_release(). A caller + * _must_ own a normal object reference whenever calling those functions. + * Unlike object references, acquiring an active reference can fail (by + * returning 'false' from kdbus_node_acquire()). An active reference can + * only be acquired if the node is marked active. If it is not marked + * active, yet, or if it was already deactivated, no more active references + * can be acquired, ever! + * Active references are used to track tasks working on a node. Whenever a + * task enters kernel-space to perform an action on a node, it acquires an + * active reference, performs the action and releases the reference again. + * While holding an active reference, the node is guaranteed to stay active. + * If the node is deactivated in parallel, the node is marked as + * deactivated, then we wait for all active references to be dropped, before + * we finally proceed with any cleanups. 
That is, if you hold an active + * reference to a node, any resources that are bound to the "active" state + * are guaranteed to stay accessible until you release your reference. + * + * Active-references are very similar to rw-locks, where acquiring a node is + * equal to try-read-lock and releasing to read-unlock. Deactivating a node + * means write-lock and never releasing it again. + * Unlike rw-locks, the 'active reference' concept is more versatile and + * avoids unusual rw-lock usage (never releasing a write-lock..). + * + * It is safe to acquire multiple active-references recursively. But you + * need to check the return value of kdbus_node_acquire() on _each_ call. It + * may stop granting references at _any_ time. + * + * You're free to perform any operations you want while holding an active + * reference, except sleeping for an indefinite period. Sleeping for a fixed + * amount of time is fine, but you usually should not wait on wait-queues + * without a timeout. + * For example, if you wait for I/O to happen, you should gather all data + * and schedule the I/O operation, then release your active reference and + * wait for it to complete. Then try to acquire a new reference. If it + * fails, perform any cleanup (the node is now dead). Otherwise, you can + * finish your operation. + * + * All nodes can be deactivated via kdbus_node_deactivate() at any time. You can + * call this multiple times, even in parallel or on nodes that were never + * linked, and it will just work. Furthermore, all children will be deactivated + * recursively as well. If a node is deactivated, there might still be active + * references that were acquired before calling kdbus_node_deactivate(). The + * owner of an object must call kdbus_node_drain() (which is a superset of + * kdbus_node_deactivate()) before dropping their reference. This will + * deactivate the node and also synchronously wait for all active references to + * be dropped. Hence, once kdbus_node_drain() returns, the node is fully + * released and no active references exist, anymore. + * kdbus_node_drain() can be called at any times, multiple times, and in + * parallel on multiple threads. All calls are synchronized internally and will + * return only once the node is fully drained. The only restriction is, you + * must not hold an active reference when calling kdbus_node_drain() (unlike + * deactivation, which allows the caller to hold an active reference). + * + * When a node is activated, we acquire a normal object reference to the node. + * This reference is dropped after deactivation is fully done (and only if the + * node really was activated). This allows callers to link+activate a child node + * and then drop all refs. This has the effect that nobody owns a reference to + * the node, except for the parent node. Hence, if the parent is deactivated + * (and thus all children are deactivated, too), this will automatically + * release the child node. + * + * Currently, nodes provide a bunch of resources that external code can use + * directly. This includes: + * + * * node->waitq: Each node has its own wait-queue that is used to manage + * the 'active' state. When a node is deactivated, we wait on + * this queue until all active refs are dropped. Analogously, + * when you release an active reference on a deactivated + * node, and the active ref-count drops to 0, we wake up a + * single thread on this queue. Furthermore, once the + * ->release_cb() callback finished, we wake up all waiters. 
+ * The node-owner is free to re-use this wait-queue for other + * purposes. As node-management uses this queue only during + * deactivation, it is usually totally fine to re-use the + * queue for other, preferably low-overhead, use-cases. + * + * * node->type: This field defines the type of the owner of this node. It + * must be set during node initialization and must remain + * constant. The node management never looks at this value, + * but external users might use to gain access to the owner + * object of a node. + * It is totally up to the owner of the node to define what + * their type means. Usually it means you can access the + * parent structure via container_of(), as long as you hold an + * active reference to the node. + * + * * node->free_cb: callback after all references are dropped + * node->release_cb: callback during node deactivation + * These fields must be set by the node owner during + * node initialization. They must remain constant. If + * NULL, they're skipped. + * + * * node->mode: filesystem access modes + * node->uid: filesystem owner uid + * node->gid: filesystem owner gid + * These fields must be set by the node owner during node + * initialization. They must remain constant and may be + * accessed by other callers to properly initialize + * filesystem nodes. + * + * * node->id: This is an unsigned 32bit integer allocated by an IDA. It is + * always kept as small as possible during allocation and is + * globally unique across all nodes allocated by this module. 0 + * is reserved as "not assigned" and is the default. + * The ID is assigned during kdbus_node_link() and is kept until + * the object is freed. Thus, the ID surpasses the active + * lifetime of a node. As long as you hold an object reference + * to a node (and the node was linked once), the ID is valid and + * unique. + * + * * node->name: name of this node + * node->hash: 31bit hash-value of @name (range [2..INT_MAX-1]) + * These values follow the same lifetime rules as node->id. + * They're initialized when the node is linked and then remain + * constant until the last object reference is dropped. + * Unlike the id, the name is only unique across all siblings + * and only until the node is deactivated. Currently, the name + * is even unique if linked but not activated, yet. This might + * change in the future, though. Code should not rely on this. + * + * * node->lock: lock to protect node->children, node->rb, node->parent + * * node->parent: Reference to parent node. This is set during LINK time + * and is dropped during destruction. You can freely access + * this field, but it may be NULL (root node). + * * node->children: rb-tree of all linked children of this node. You must + * not access this directly, but use one of the iterator + * or lookup helpers. + */ + +/* + * Bias values track states of "active references". They're all negative. If a + * node is active, its active-ref-counter is >=0 and tracks all active + * references. Once a node is deactivaed, we subtract NODE_BIAS. This means, the + * counter is now negative but still counts the active references. Once it drops + * to exactly NODE_BIAS, we know all active references were dropped. Exactly one + * thread will change it to NODE_RELEASE now, perform cleanup and then put it + * into NODE_DRAINED. Once drained, all other threads that tried deactivating + * the node will now be woken up (thus, they wait until the node is fully done). + * The initial state during node-setup is NODE_NEW. 
If a node is directly + * deactivated without having ever been active, it is put into + * NODE_RELEASE_DIRECT instead of NODE_BIAS. This tracks this one-bit state + * across node-deactivation. The task putting it into NODE_RELEASE now knows + * whether the node was active before or not. + * + * Some archs implement atomic_sub(v) with atomic_add(-v), so reserve INT_MIN + * to avoid overflows if multiplied by -1. + */ +#define KDBUS_NODE_BIAS (INT_MIN + 5) +#define KDBUS_NODE_RELEASE_DIRECT (KDBUS_NODE_BIAS - 1) +#define KDBUS_NODE_RELEASE (KDBUS_NODE_BIAS - 2) +#define KDBUS_NODE_DRAINED (KDBUS_NODE_BIAS - 3) +#define KDBUS_NODE_NEW (KDBUS_NODE_BIAS - 4) + +/* global unique ID mapping for kdbus nodes */ +DEFINE_IDA(kdbus_node_ida); + +/** + * kdbus_node_name_hash() - hash a name + * @name: The string to hash + * + * This computes the hash of @name. It is guaranteed to be in the range + * [2..INT_MAX-1]. The values 1, 2 and INT_MAX are unused as they are reserved + * for the filesystem code. + * + * Return: hash value of the passed string + */ +static unsigned int kdbus_node_name_hash(const char *name) +{ + unsigned int hash; + + /* reserve hash numbers 0, 1 and >=INT_MAX for magic directories */ + hash = kdbus_strhash(name) & INT_MAX; + if (hash < 2) + hash += 2; + if (hash >= INT_MAX) + hash = INT_MAX - 1; + + return hash; +} + +/** + * kdbus_node_name_compare() - compare a name with a node's name + * @hash: hash of the string to compare the node with + * @name: name to compare the node with + * @node: node to compare the name with + * + * This compares a query string against a kdbus node. If the kdbus node has the + * given name, this returns 0. Otherwise, this returns >0 / <0 depending + * whether the query string is greater / less than the node. + * + * Note: If @node is drained but has the name @name, this returns 1. The + * reason for this is that we treat drained nodes as "renamed". The + * slot of such nodes is no longer occupied and new nodes can claim it. + * Obviously, this has the side-effect that you cannot match drained + * nodes, as they will never return 0 on name-matches. But this is + * intentional, as there is no reason why anyone would ever want to match + * on drained nodes. + * + * Return: 0 if @name and @hash exactly match the information in @node, or + * an integer less than or greater than zero if @name is found, respectively, + * to be less than or be greater than the string stored in @node. + */ +static int kdbus_node_name_compare(unsigned int hash, const char *name, + const struct kdbus_node *node) +{ + int ret; + + if (hash != node->hash) + return hash - node->hash; + + ret = strcmp(name, node->name); + if (ret != 0) + return ret; + + return atomic_read(&node->active) == KDBUS_NODE_DRAINED; +} + +/** + * kdbus_node_init() - initialize a kdbus_node + * @node: Pointer to the node to initialize + * @type: The type the node will have (KDBUS_NODE_*) + * + * The caller is responsible of allocating @node and initializating it to zero. + * Once this call returns, you must use the node_ref() and node_unref() + * functions to manage this node. 
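The DOC comment above notes that node owners usually recover their object from an embedded kdbus_node via container_of() while they hold an active reference, and free it from ->free_cb once the last object reference is gone. A minimal sketch of that embedding pattern, assuming kernel context; struct foo and its helpers are made up for illustration and are not part of this patch.

struct foo {
        struct kdbus_node node;
        /* owner-specific state lives here */
};

static struct foo *foo_from_node(struct kdbus_node *node)
{
        /* only valid while a reference (or active ref) pins the owner */
        return container_of(node, struct foo, node);
}

/* would be assigned to node->free_cb before the node is linked */
static void foo_free(struct kdbus_node *node)
{
        kfree(foo_from_node(node));
}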
+ */ +void kdbus_node_init(struct kdbus_node *node, unsigned int type) +{ + atomic_set(&node->refcnt, 1); + mutex_init(&node->lock); + node->id = 0; + node->type = type; + RB_CLEAR_NODE(&node->rb); + node->children = RB_ROOT; + init_waitqueue_head(&node->waitq); + atomic_set(&node->active, KDBUS_NODE_NEW); +} + +/** + * kdbus_node_link() - link a node into the nodes system + * @node: Pointer to the node to initialize + * @parent: Pointer to a parent node, may be %NULL + * @name: The name of the node (or NULL if root node) + * + * This links a node into the hierarchy. This must not be called multiple times. + * If @parent is NULL, the node becomes a new root node. + * + * This call will fail if @name is not unique across all its siblings or if no + * ID could be allocated. You must not activate a node if linking failed! It is + * safe to deactivate it, though. + * + * Once you linked a node, you must call kdbus_node_drain() before you drop + * the last reference (even if you never activate the node). + * + * Return: 0 on success. negative error otherwise. + */ +int kdbus_node_link(struct kdbus_node *node, struct kdbus_node *parent, + const char *name) +{ + int ret; + + if (WARN_ON(node->type != KDBUS_NODE_DOMAIN && !parent)) + return -EINVAL; + + if (WARN_ON(parent && !name)) + return -EINVAL; + + if (name) { + node->name = kstrdup(name, GFP_KERNEL); + if (!node->name) + return -ENOMEM; + + node->hash = kdbus_node_name_hash(name); + } + + ret = ida_simple_get(&kdbus_node_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + return ret; + + node->id = ret; + ret = 0; + + if (parent) { + struct rb_node **n, *prev; + + if (!kdbus_node_acquire(parent)) + return -ESHUTDOWN; + + mutex_lock(&parent->lock); + + n = &parent->children.rb_node; + prev = NULL; + + while (*n) { + struct kdbus_node *pos; + int result; + + pos = kdbus_node_from_rb(*n); + prev = *n; + result = kdbus_node_name_compare(node->hash, + node->name, + pos); + if (result == 0) { + ret = -EEXIST; + goto exit_unlock; + } + + if (result < 0) + n = &pos->rb.rb_left; + else + n = &pos->rb.rb_right; + } + + /* add new node and rebalance the tree */ + rb_link_node(&node->rb, prev, n); + rb_insert_color(&node->rb, &parent->children); + node->parent = kdbus_node_ref(parent); + +exit_unlock: + mutex_unlock(&parent->lock); + kdbus_node_release(parent); + } + + return ret; +} + +/** + * kdbus_node_ref() - Acquire object reference + * @node: node to acquire reference to (or NULL) + * + * This acquires a new reference to @node. You must already own a reference when + * calling this! + * If @node is NULL, this is a no-op. + * + * Return: @node is returned + */ +struct kdbus_node *kdbus_node_ref(struct kdbus_node *node) +{ + if (node) + atomic_inc(&node->refcnt); + return node; +} + +/** + * kdbus_node_unref() - Drop object reference + * @node: node to drop reference to (or NULL) + * + * This drops an object reference to @node. You must not access the node if you + * no longer own a reference. + * If the ref-count drops to 0, the object will be destroyed (->free_cb will be + * called). + * + * If you linked or activated the node, you must deactivate the node before you + * drop your last reference! If you didn't link or activate the node, you can + * drop any reference you want. + * + * Note that this calls into ->free_cb() and thus _might_ sleep. The ->free_cb() + * callbacks must not acquire any outer locks, though. So you can safely drop + * references while holding locks (apart from node->parent->lock). + * + * If @node is NULL, this is a no-op. 
+ * + * Return: This always returns NULL + */ +struct kdbus_node *kdbus_node_unref(struct kdbus_node *node) +{ + if (node && atomic_dec_and_test(&node->refcnt)) { + struct kdbus_node safe = *node; + + WARN_ON(atomic_read(&node->active) != KDBUS_NODE_DRAINED); + + if (node->parent) { + mutex_lock(&node->parent->lock); + if (!RB_EMPTY_NODE(&node->rb)) { + rb_erase(&node->rb, + &node->parent->children); + RB_CLEAR_NODE(&node->rb); + } + mutex_unlock(&node->parent->lock); + } + + if (node->free_cb) + node->free_cb(node); + if (safe.id > 0) + ida_simple_remove(&kdbus_node_ida, safe.id); + + kfree(safe.name); + kdbus_node_unref(safe.parent); + } + + return NULL; +} + +/** + * kdbus_node_is_active() - test whether a node is active + * @node: node to test + * + * This checks whether @node is active. That means, @node was linked and + * activated by the node owner and hasn't been deactivated, yet. If, and only + * if, a node is active, kdbus_node_acquire() will be able to acquire active + * references. + * + * Note that this function does not give any lifetime guarantees. After this + * call returns, the node might be deactivated immediately. Normally, what you + * want is to acquire a real active reference via kdbus_node_acquire(). + * + * Return: true if @node is active, false otherwise + */ +bool kdbus_node_is_active(struct kdbus_node *node) +{ + return atomic_read(&node->active) >= 0; +} + +/** + * kdbus_node_is_deactivated() - test whether a node was already deactivated + * @node: node to test + * + * This checks whether kdbus_node_deactivate() was called on @node. Note that + * this might be true even if you never deactivated the node directly, but only + * one of its ancestors. + * + * Note that even if this returns 'false', the node might get deactivated + * immediately after the call returns. + * + * Return: true if @node was already deactivated, false if not + */ +bool kdbus_node_is_deactivated(struct kdbus_node *node) +{ + int v; + + v = atomic_read(&node->active); + return v != KDBUS_NODE_NEW && v < 0; +} + +/** + * kdbus_node_activate() - activate a node + * @node: node to activate + * + * This marks @node as active if, and only if, the node wasn't activated nor + * deactivated, yet, and the parent is still active. Any but the first call to + * kdbus_node_activate() is a no-op. + * If you called kdbus_node_deactivate() before, then even the first call to + * kdbus_node_activate() will be a no-op. + * + * This call doesn't give any lifetime guarantees. The node might get + * deactivated immediately after this call returns. Or the parent might already + * be deactivated, which will make this call a no-op. + * + * If this call successfully activated a node, it will take an object reference + * to it. This reference is dropped after the node is deactivated. Therefore, + * the object owner can safely drop their reference to @node iff they know that + * its parent node will get deactivated at some point. Once the parent node is + * deactivated, it will deactivate all its child and thus drop this reference + * again. + * + * Return: True if this call successfully activated the node, otherwise false. + * Note that this might return false, even if the node is still active + * (eg., if you called this a second time). 
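Taken together, kdbus_node_init(), kdbus_node_link(), kdbus_node_activate(), kdbus_node_drain() and kdbus_node_unref() implement the NEW -> LINKED -> ACTIVE -> INACTIVE -> DRAINED -> FREED lifecycle spelled out in the DOC comment. Below is a hedged sketch of a typical create path; the foo_ names are invented, the errno on a dead parent is an arbitrary choice, and a real owner would also set ->mode, ->uid, ->gid and ->release_cb.

static void foo_node_free(struct kdbus_node *node)
{
        kfree(node);
}

static struct kdbus_node *foo_node_create(struct kdbus_node *parent,
                                          const char *name)
{
        struct kdbus_node *n;
        int ret;

        n = kzalloc(sizeof(*n), GFP_KERNEL);
        if (!n)
                return ERR_PTR(-ENOMEM);

        kdbus_node_init(n, KDBUS_NODE_ENDPOINT);        /* NEW */
        n->free_cb = foo_node_free;

        ret = kdbus_node_link(n, parent, name);         /* LINKED */
        if (ret < 0)
                goto exit;

        if (!kdbus_node_activate(n)) {                  /* ACTIVE */
                ret = -ESHUTDOWN;       /* parent went away already */
                goto exit;
        }

        return n;

exit:
        /* linked nodes must be drained before the last ref is dropped */
        kdbus_node_drain(n);
        kdbus_node_unref(n);
        return ERR_PTR(ret);
}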
+ */ +bool kdbus_node_activate(struct kdbus_node *node) +{ + bool res = false; + + mutex_lock(&node->lock); + if (atomic_read(&node->active) == KDBUS_NODE_NEW) { + atomic_sub(KDBUS_NODE_NEW, &node->active); + /* activated nodes have ref +1 */ + kdbus_node_ref(node); + res = true; + } + mutex_unlock(&node->lock); + + return res; +} + +/** + * kdbus_node_recurse_unlock() - advance iterator on a tree + * @start: node at which the iteration started + * @node: previously visited node + * + * This helper advances an iterator by one, when traversing a node tree. It is + * supposed to be used like this: + * + * struct kdbus_node *n; + * + * n = start; + * while (n) { + * mutex_lock(&n->lock); + * ... visit @n ... + * n = kdbus_node_recurse_unlock(start, n); + * } + * + * This helpers takes as input the start-node of the iteration and the current + * position. It returns a pointer to the next node to visit. The caller must + * hold a reference to @start during the whole iteration. Furthermore, @node + * must be locked when entering this helper. On return, the lock is released. + * + * The order of visit is pre-order traversal. + * + * If @node is deactivated before recursing its children, then it is guaranteed + * that all children will be visited. If @node is still active, new nodes might + * be inserted during traversal, and thus might be missed. + * + * Also note that the node-locks are released while traversing children. You + * must not rely on the locks to be held during the whole traversal. Each node + * that is visited is pinned by this helper, so the caller can rely on owning a + * reference. It is dropped, once all of the children of the node have been + * visited (recursively). + * + * You *must not* bail out of a traversal early, otherwise you'll leak + * ref-counts to all nodes in the current depth-path. + * + * Return: Reference to next node, or NULL. + */ +static struct kdbus_node *kdbus_node_recurse_unlock(struct kdbus_node *start, + struct kdbus_node *node) +{ + struct kdbus_node *t, *prev = NULL; + struct rb_node *rb; + + lockdep_assert_held(&node->lock); + + rb = rb_first(&node->children); + if (!rb) { + do { + mutex_unlock(&node->lock); + kdbus_node_unref(prev); + + if (node == start) + return NULL; + + prev = node; + node = node->parent; + + mutex_lock(&node->lock); + rb = rb_next(&prev->rb); + } while (!rb); + } + + t = kdbus_node_ref(kdbus_node_from_rb(rb)); + mutex_unlock(&node->lock); + kdbus_node_unref(prev); + return t; +} + +/** + * kdbus_node_deactivate() - deactivate a node + * @node: node to deactivate + * + * This recursively deactivates the passed node and all its children. The nodes + * are marked as deactivated, but they're not drained. Hence, even after this + * call returns, there might still be someone holding an active reference to + * any of the nodes. However, no new active references can be acquired after + * this returns. + * + * It is safe to call this multiple times (even in parallel). Each call is + * guaranteed to only return after _all_ nodes have been deactivated. + */ +void kdbus_node_deactivate(struct kdbus_node *node) +{ + struct kdbus_node *pos; + int v; + + pos = node; + while (pos) { + mutex_lock(&pos->lock); + + /* + * Add BIAS to pos->active to mark it as inactive. If it was + * never active before, immediately mark it as RELEASE_INACTIVE + * so that this case can be detected later on. + * If the node was already deactivated, make sure to still + * recurse into the children. 
Otherwise, we might return before + * a racing thread finished deactivating all children. But we + * want to guarantee that the whole tree is deactivated once + * this returns. + */ + v = atomic_read(&pos->active); + if (v >= 0) + atomic_add_return(KDBUS_NODE_BIAS, &pos->active); + else if (v == KDBUS_NODE_NEW) + atomic_set(&pos->active, KDBUS_NODE_RELEASE_DIRECT); + + pos = kdbus_node_recurse_unlock(node, pos); + } +} + +/** + * kdbus_node_drain() - drain a node + * @node: node to drain + * + * This function recursively deactivates this node and all its children and + * then waits for all active references to be dropped. This function is a + * superset of kdbus_node_deactivate(), as it additionally drains all nodes. It + * returns only once all children and the node itself were recursively drained + * (even if you call this function multiple times in parallel). + * + * It is safe to call this function on _any_ node that was initialized _any_ + * number of times. + * + * This call may sleep, as it waits for all active references to be dropped. + */ +void kdbus_node_drain(struct kdbus_node *node) +{ + struct kdbus_node *pos; + int v; + + kdbus_node_deactivate(node); + + pos = node; + while (pos) { + /* wait until all active references were dropped */ + wait_event(pos->waitq, + atomic_read(&pos->active) <= KDBUS_NODE_BIAS); + + /* mark object as RELEASE */ + mutex_lock(&pos->lock); + v = atomic_read(&pos->active); + if (v == KDBUS_NODE_BIAS || v == KDBUS_NODE_RELEASE_DIRECT) + atomic_set(&pos->active, KDBUS_NODE_RELEASE); + mutex_unlock(&pos->lock); + + /* + * If this is the thread that marked the object as RELEASE, we + * perform the actual release. Otherwise, we wait until the + * release is done and the node is marked as DRAINED. + */ + if (v == KDBUS_NODE_BIAS || v == KDBUS_NODE_RELEASE_DIRECT) { + if (pos->release_cb) + pos->release_cb(pos, v == KDBUS_NODE_BIAS); + + /* mark as DRAINED */ + atomic_set(&pos->active, KDBUS_NODE_DRAINED); + wake_up_all(&pos->waitq); + + /* drop VFS cache */ + kdbus_fs_flush(pos); + + /* + * If the node was activated and someone subtracted BIAS + * from it to deactivate it, we, and only us, are + * responsible to release the extra ref-count that was + * taken once in kdbus_node_activate(). + * If the node was never activated, no-one ever + * subtracted BIAS, but instead skipped that state and + * immediately went to NODE_RELEASE_DIRECT. In that case + * we must not drop the reference. + */ + if (v == KDBUS_NODE_BIAS) + kdbus_node_unref(pos); + } else { + /* wait until object is DRAINED */ + wait_event(pos->waitq, + atomic_read(&pos->active) == KDBUS_NODE_DRAINED); + } + + mutex_lock(&pos->lock); + pos = kdbus_node_recurse_unlock(node, pos); + } +} + +/** + * kdbus_node_acquire() - Acquire an active ref on a node + * @node: The node + * + * This acquires an active-reference to @node. This will only succeed if the + * node is active. You must release this active reference via + * kdbus_node_release() again. + * + * See the introduction to "active references" for more details. + * + * Return: %true if @node was non-NULL and active + */ +bool kdbus_node_acquire(struct kdbus_node *node) +{ + return node && atomic_inc_unless_negative(&node->active); +} + +/** + * kdbus_node_release() - Release an active ref on a node + * @node: The node + * + * This releases an active reference that was previously acquired via + * kdbus_node_acquire(). See kdbus_node_acquire() for details. 
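The acquire/operate/release pattern described in the "Active References" section boils down to the shape below; a minimal sketch, with foo_ping() invented for illustration and the operation reduced to touching fields that the DOC comment guarantees to be stable while a reference is held.

static int foo_ping(struct kdbus_node *node)
{
        if (!kdbus_node_acquire(node))
                return -ESHUTDOWN;      /* node already deactivated */

        /* node->name and node->id stay valid while the ref is held */
        pr_debug("pinged node %s (id %u)\n", node->name, node->id);

        kdbus_node_release(node);
        return 0;
}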
+ */ +void kdbus_node_release(struct kdbus_node *node) +{ + if (node && atomic_dec_return(&node->active) == KDBUS_NODE_BIAS) + wake_up(&node->waitq); +} + +/** + * kdbus_node_find_child() - Find child by name + * @node: parent node to search through + * @name: name of child node + * + * This searches through all children of @node for a child-node with name @name. + * If not found, or if the child is deactivated, NULL is returned. Otherwise, + * the child is acquired and a new reference is returned. + * + * If you're done with the child, you need to release it and drop your + * reference. + * + * This function does not acquire the parent node. However, if the parent was + * already deactivated, then kdbus_node_deactivate() will, at some point, also + * deactivate the child. Therefore, we can rely on the explicit ordering during + * deactivation. + * + * Return: Reference to acquired child node, or NULL if not found / not active. + */ +struct kdbus_node *kdbus_node_find_child(struct kdbus_node *node, + const char *name) +{ + struct kdbus_node *child; + struct rb_node *rb; + unsigned int hash; + int ret; + + hash = kdbus_node_name_hash(name); + + mutex_lock(&node->lock); + rb = node->children.rb_node; + while (rb) { + child = kdbus_node_from_rb(rb); + ret = kdbus_node_name_compare(hash, name, child); + if (ret < 0) + rb = rb->rb_left; + else if (ret > 0) + rb = rb->rb_right; + else + break; + } + if (rb && kdbus_node_acquire(child)) + kdbus_node_ref(child); + else + child = NULL; + mutex_unlock(&node->lock); + + return child; +} + +static struct kdbus_node *node_find_closest_unlocked(struct kdbus_node *node, + unsigned int hash, + const char *name) +{ + struct kdbus_node *n, *pos = NULL; + struct rb_node *rb; + int res; + + /* + * Find the closest child with ``node->hash >= hash'', or, if @name is + * valid, ``node->name >= name'' (where '>=' is the lex. order). + */ + + rb = node->children.rb_node; + while (rb) { + n = kdbus_node_from_rb(rb); + + if (name) + res = kdbus_node_name_compare(hash, name, n); + else + res = hash - n->hash; + + if (res <= 0) { + rb = rb->rb_left; + pos = n; + } else { /* ``hash > n->hash'', ``name > n->name'' */ + rb = rb->rb_right; + } + } + + return pos; +} + +/** + * kdbus_node_find_closest() - Find closest child-match + * @node: parent node to search through + * @hash: hash value to find closest match for + * + * Find the closest child of @node with a hash greater than or equal to @hash. + * The closest match is the left-most child of @node with this property. Which + * means, it is the first child with that hash returned by + * kdbus_node_next_child(), if you'd iterate the whole parent node. + * + * Return: Reference to acquired child, or NULL if none found. + */ +struct kdbus_node *kdbus_node_find_closest(struct kdbus_node *node, + unsigned int hash) +{ + struct kdbus_node *child; + struct rb_node *rb; + + mutex_lock(&node->lock); + + child = node_find_closest_unlocked(node, hash, NULL); + while (child && !kdbus_node_acquire(child)) { + rb = rb_next(&child->rb); + if (rb) + child = kdbus_node_from_rb(rb); + else + child = NULL; + } + kdbus_node_ref(child); + + mutex_unlock(&node->lock); + + return child; +} + +/** + * kdbus_node_next_child() - Acquire next child + * @node: parent node + * @prev: previous child-node position or NULL + * + * This function returns a reference to the next active child of @node, after + * the passed position @prev. If @prev is NULL, a reference to the first active + * child is returned. 
If no more active children are found, NULL is returned. + * + * This function acquires the next child it returns. If you're done with the + * returned pointer, you need to release _and_ unref it. + * + * The passed in pointer @prev is not modified by this function, and it does + * *not* have to be active. If @prev was acquired via different means, or if it + * was unlinked from its parent before you pass it in, then this iterator will + * still return the next active child (it will have to search through the + * rb-tree based on the node-name, though). + * However, @prev must not be linked to a different parent than @node! + * + * Return: Reference to next acquired child, or NULL if at the end. + */ +struct kdbus_node *kdbus_node_next_child(struct kdbus_node *node, + struct kdbus_node *prev) +{ + struct kdbus_node *pos = NULL; + struct rb_node *rb; + + mutex_lock(&node->lock); + + if (!prev) { + /* + * New iteration; find first node in rb-tree and try to acquire + * it. If we got it, directly return it as first element. + * Otherwise, the loop below will find the next active node. + */ + rb = rb_first(&node->children); + if (!rb) + goto exit; + pos = kdbus_node_from_rb(rb); + if (kdbus_node_acquire(pos)) + goto exit; + } else { + /* + * The current iterator is still linked to the parent. Set it + * as current position and use the loop below to find the next + * active element. + */ + pos = prev; + } + + /* @pos was already returned or is inactive; find next active node */ + do { + rb = rb_next(&pos->rb); + if (rb) + pos = kdbus_node_from_rb(rb); + else + pos = NULL; + } while (pos && !kdbus_node_acquire(pos)); + +exit: + /* @pos is NULL or acquired. Take ref if non-NULL and return it */ + kdbus_node_ref(pos); + mutex_unlock(&node->lock); + return pos; +} diff -Nur linux-4.2/ipc/kdbus/node.h linux-4.2-pck/ipc/kdbus/node.h --- linux-4.2/ipc/kdbus/node.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/node.h 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
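kdbus_node_next_child() hands back each child acquired and referenced, and its kernel-doc requires the caller to release and unref a returned child once done with it. A hedged sketch of a full iteration over the active children of a parent node, with foo_walk_children() invented for illustration:

static void foo_walk_children(struct kdbus_node *parent)
{
        struct kdbus_node *pos, *prev = NULL;

        while ((pos = kdbus_node_next_child(parent, prev))) {
                /* pos is acquired and referenced at this point */
                pr_debug("child %s (id %u)\n", pos->name, pos->id);

                if (prev) {
                        kdbus_node_release(prev);
                        kdbus_node_unref(prev);
                }
                prev = pos;
        }

        if (prev) {
                kdbus_node_release(prev);
                kdbus_node_unref(prev);
        }
}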
+ */ + +#ifndef __KDBUS_NODE_H +#define __KDBUS_NODE_H + +#include +#include +#include +#include + +struct kdbus_node; + +enum kdbus_node_type { + KDBUS_NODE_DOMAIN, + KDBUS_NODE_CONTROL, + KDBUS_NODE_BUS, + KDBUS_NODE_ENDPOINT, +}; + +typedef void (*kdbus_node_free_t) (struct kdbus_node *node); +typedef void (*kdbus_node_release_t) (struct kdbus_node *node, bool was_active); + +struct kdbus_node { + struct mutex lock; + atomic_t refcnt; + atomic_t active; + wait_queue_head_t waitq; + + /* static members */ + unsigned int type; + kdbus_node_free_t free_cb; + kdbus_node_release_t release_cb; + umode_t mode; + kuid_t uid; + kgid_t gid; + + /* valid once linked */ + char *name; + unsigned int hash; + unsigned int id; + struct kdbus_node *parent; /* may be NULL */ + struct rb_node rb; + + /* dynamic list of children */ + struct rb_root children; +}; + +#define kdbus_node_from_rb(_node) rb_entry((_node), struct kdbus_node, rb) + +extern struct ida kdbus_node_ida; + +void kdbus_node_init(struct kdbus_node *node, unsigned int type); + +int kdbus_node_link(struct kdbus_node *node, struct kdbus_node *parent, + const char *name); + +struct kdbus_node *kdbus_node_ref(struct kdbus_node *node); +struct kdbus_node *kdbus_node_unref(struct kdbus_node *node); + +bool kdbus_node_is_active(struct kdbus_node *node); +bool kdbus_node_is_deactivated(struct kdbus_node *node); +bool kdbus_node_activate(struct kdbus_node *node); +void kdbus_node_deactivate(struct kdbus_node *node); +void kdbus_node_drain(struct kdbus_node *node); + +bool kdbus_node_acquire(struct kdbus_node *node); +void kdbus_node_release(struct kdbus_node *node); + +struct kdbus_node *kdbus_node_find_child(struct kdbus_node *node, + const char *name); +struct kdbus_node *kdbus_node_find_closest(struct kdbus_node *node, + unsigned int hash); +struct kdbus_node *kdbus_node_next_child(struct kdbus_node *node, + struct kdbus_node *prev); + +#endif diff -Nur linux-4.2/ipc/kdbus/notify.c linux-4.2-pck/ipc/kdbus/notify.c --- linux-4.2/ipc/kdbus/notify.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/notify.c 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,204 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "domain.h" +#include "endpoint.h" +#include "item.h" +#include "message.h" +#include "notify.h" + +static inline void kdbus_notify_add_tail(struct kdbus_staging *staging, + struct kdbus_bus *bus) +{ + spin_lock(&bus->notify_lock); + list_add_tail(&staging->notify_entry, &bus->notify_list); + spin_unlock(&bus->notify_lock); +} + +static int kdbus_notify_reply(struct kdbus_bus *bus, u64 id, + u64 cookie, u64 msg_type) +{ + struct kdbus_staging *s; + + s = kdbus_staging_new_kernel(bus, id, cookie, 0, msg_type); + if (IS_ERR(s)) + return PTR_ERR(s); + + kdbus_notify_add_tail(s, bus); + return 0; +} + +/** + * kdbus_notify_reply_timeout() - queue a timeout reply + * @bus: Bus which queues the messages + * @id: The destination's connection ID + * @cookie: The cookie to set in the reply. + * + * Queues a message that has a KDBUS_ITEM_REPLY_TIMEOUT item attached. + * + * Return: 0 on success, negative errno on failure. + */ +int kdbus_notify_reply_timeout(struct kdbus_bus *bus, u64 id, u64 cookie) +{ + return kdbus_notify_reply(bus, id, cookie, KDBUS_ITEM_REPLY_TIMEOUT); +} + +/** + * kdbus_notify_reply_dead() - queue a 'dead' reply + * @bus: Bus which queues the messages + * @id: The destination's connection ID + * @cookie: The cookie to set in the reply. + * + * Queues a message that has a KDBUS_ITEM_REPLY_DEAD item attached. + * + * Return: 0 on success, negative errno on failure. + */ +int kdbus_notify_reply_dead(struct kdbus_bus *bus, u64 id, u64 cookie) +{ + return kdbus_notify_reply(bus, id, cookie, KDBUS_ITEM_REPLY_DEAD); +} + +/** + * kdbus_notify_name_change() - queue a notification about a name owner change + * @bus: Bus which queues the messages + * @type: The type if the notification; KDBUS_ITEM_NAME_ADD, + * KDBUS_ITEM_NAME_CHANGE or KDBUS_ITEM_NAME_REMOVE + * @old_id: The id of the connection that used to own the name + * @new_id: The id of the new owner connection + * @old_flags: The flags to pass in the KDBUS_ITEM flags field for + * the old owner + * @new_flags: The flags to pass in the KDBUS_ITEM flags field for + * the new owner + * @name: The name that was removed or assigned to a new owner + * + * Return: 0 on success, negative errno on failure. + */ +int kdbus_notify_name_change(struct kdbus_bus *bus, u64 type, + u64 old_id, u64 new_id, + u64 old_flags, u64 new_flags, + const char *name) +{ + size_t name_len, extra_size; + struct kdbus_staging *s; + + name_len = strlen(name) + 1; + extra_size = sizeof(struct kdbus_notify_name_change) + name_len; + + s = kdbus_staging_new_kernel(bus, KDBUS_DST_ID_BROADCAST, 0, + extra_size, type); + if (IS_ERR(s)) + return PTR_ERR(s); + + s->notify->name_change.old_id.id = old_id; + s->notify->name_change.old_id.flags = old_flags; + s->notify->name_change.new_id.id = new_id; + s->notify->name_change.new_id.flags = new_flags; + memcpy(s->notify->name_change.name, name, name_len); + + kdbus_notify_add_tail(s, bus); + return 0; +} + +/** + * kdbus_notify_id_change() - queue a notification about a unique ID change + * @bus: Bus which queues the messages + * @type: The type if the notification; KDBUS_ITEM_ID_ADD or + * KDBUS_ITEM_ID_REMOVE + * @id: The id of the connection that was added or removed + * @flags: The flags to pass in the KDBUS_ITEM flags field + * + * Return: 0 on success, negative errno on failure. 
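The notification helpers only collect staging messages on bus->notify_list; nothing reaches a connection until kdbus_notify_flush() runs, which lets callers queue notifications while holding their own locks and deliver them afterwards. A minimal sketch of that queue-then-flush usage, with foo_announce_id_add() invented for illustration:

static int foo_announce_id_add(struct kdbus_bus *bus, u64 id, u64 flags)
{
        int ret;

        /* collected on bus->notify_list, nothing is delivered yet */
        ret = kdbus_notify_id_change(bus, KDBUS_ITEM_ID_ADD, id, flags);
        if (ret < 0)
                return ret;

        /* deliver everything queued so far, outside of caller locks */
        kdbus_notify_flush(bus);
        return 0;
}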
+ */
+int kdbus_notify_id_change(struct kdbus_bus *bus, u64 type, u64 id, u64 flags)
+{
+        struct kdbus_staging *s;
+        size_t extra_size;
+
+        extra_size = sizeof(struct kdbus_notify_id_change);
+        s = kdbus_staging_new_kernel(bus, KDBUS_DST_ID_BROADCAST, 0,
+                                     extra_size, type);
+        if (IS_ERR(s))
+                return PTR_ERR(s);
+
+        s->notify->id_change.id = id;
+        s->notify->id_change.flags = flags;
+
+        kdbus_notify_add_tail(s, bus);
+        return 0;
+}
+
+/**
+ * kdbus_notify_flush() - send a list of collected messages
+ * @bus: Bus which queues the messages
+ *
+ * The list is empty after sending the messages.
+ */
+void kdbus_notify_flush(struct kdbus_bus *bus)
+{
+        LIST_HEAD(notify_list);
+        struct kdbus_staging *s, *tmp;
+
+        mutex_lock(&bus->notify_flush_lock);
+        down_read(&bus->name_registry->rwlock);
+
+        spin_lock(&bus->notify_lock);
+        list_splice_init(&bus->notify_list, &notify_list);
+        spin_unlock(&bus->notify_lock);
+
+        list_for_each_entry_safe(s, tmp, &notify_list, notify_entry) {
+                if (s->msg->dst_id != KDBUS_DST_ID_BROADCAST) {
+                        struct kdbus_conn *conn;
+
+                        conn = kdbus_bus_find_conn_by_id(bus, s->msg->dst_id);
+                        if (conn) {
+                                kdbus_bus_eavesdrop(bus, NULL, s);
+                                kdbus_conn_entry_insert(NULL, conn, s, NULL,
+                                                        NULL);
+                                kdbus_conn_unref(conn);
+                        }
+                } else {
+                        kdbus_bus_broadcast(bus, NULL, s);
+                }
+
+                list_del(&s->notify_entry);
+                kdbus_staging_free(s);
+        }
+
+        up_read(&bus->name_registry->rwlock);
+        mutex_unlock(&bus->notify_flush_lock);
+}
+
+/**
+ * kdbus_notify_free() - free a list of collected messages
+ * @bus: Bus which queues the messages
+ */
+void kdbus_notify_free(struct kdbus_bus *bus)
+{
+        struct kdbus_staging *s, *tmp;
+
+        list_for_each_entry_safe(s, tmp, &bus->notify_list, notify_entry) {
+                list_del(&s->notify_entry);
+                kdbus_staging_free(s);
+        }
+}
diff -Nur linux-4.2/ipc/kdbus/notify.h linux-4.2-pck/ipc/kdbus/notify.h
--- linux-4.2/ipc/kdbus/notify.h 1969-12-31 21:00:00.000000000 -0300
+++ linux-4.2-pck/ipc/kdbus/notify.h 2015-09-08 00:48:22.238363406 -0300
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2013-2015 Kay Sievers
+ * Copyright (C) 2013-2015 Greg Kroah-Hartman
+ * Copyright (C) 2013-2015 Daniel Mack
+ * Copyright (C) 2013-2015 David Herrmann
+ * Copyright (C) 2013-2015 Linux Foundation
+ * Copyright (C) 2014-2015 Djalal Harouni
+ *
+ * kdbus is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ */ + +#ifndef __KDBUS_NOTIFY_H +#define __KDBUS_NOTIFY_H + +struct kdbus_bus; + +int kdbus_notify_id_change(struct kdbus_bus *bus, u64 type, u64 id, u64 flags); +int kdbus_notify_reply_timeout(struct kdbus_bus *bus, u64 id, u64 cookie); +int kdbus_notify_reply_dead(struct kdbus_bus *bus, u64 id, u64 cookie); +int kdbus_notify_name_change(struct kdbus_bus *bus, u64 type, + u64 old_id, u64 new_id, + u64 old_flags, u64 new_flags, + const char *name); +void kdbus_notify_flush(struct kdbus_bus *bus); +void kdbus_notify_free(struct kdbus_bus *bus); + +#endif diff -Nur linux-4.2/ipc/kdbus/policy.c linux-4.2-pck/ipc/kdbus/policy.c --- linux-4.2/ipc/kdbus/policy.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/policy.c 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,489 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "domain.h" +#include "item.h" +#include "names.h" +#include "policy.h" + +#define KDBUS_POLICY_HASH_SIZE 64 + +/** + * struct kdbus_policy_db_entry_access - a database entry access item + * @type: One of KDBUS_POLICY_ACCESS_* types + * @access: Access to grant. One of KDBUS_POLICY_* + * @uid: For KDBUS_POLICY_ACCESS_USER, the global uid + * @gid: For KDBUS_POLICY_ACCESS_GROUP, the global gid + * @list: List entry item for the entry's list + * + * This is the internal version of struct kdbus_policy_db_access. + */ +struct kdbus_policy_db_entry_access { + u8 type; /* USER, GROUP, WORLD */ + u8 access; /* OWN, TALK, SEE */ + union { + kuid_t uid; /* global uid */ + kgid_t gid; /* global gid */ + }; + struct list_head list; +}; + +/** + * struct kdbus_policy_db_entry - a policy database entry + * @name: The name to match the policy entry against + * @hentry: The hash entry for the database's entries_hash + * @access_list: List head for keeping tracks of the entry's + * access items. + * @owner: The owner of this entry. Can be a kdbus_conn or + * a kdbus_ep object. 
+ * @wildcard: The name is a wildcard, such as ending on '.*' + */ +struct kdbus_policy_db_entry { + char *name; + struct hlist_node hentry; + struct list_head access_list; + const void *owner; + bool wildcard:1; +}; + +static void kdbus_policy_entry_free(struct kdbus_policy_db_entry *e) +{ + struct kdbus_policy_db_entry_access *a, *tmp; + + list_for_each_entry_safe(a, tmp, &e->access_list, list) { + list_del(&a->list); + kfree(a); + } + + kfree(e->name); + kfree(e); +} + +static unsigned int kdbus_strnhash(const char *str, size_t len) +{ + unsigned long hash = init_name_hash(); + + while (len--) + hash = partial_name_hash(*str++, hash); + + return end_name_hash(hash); +} + +static const struct kdbus_policy_db_entry * +kdbus_policy_lookup(struct kdbus_policy_db *db, const char *name, u32 hash) +{ + struct kdbus_policy_db_entry *e; + const char *dot; + size_t len; + + /* find exact match */ + hash_for_each_possible(db->entries_hash, e, hentry, hash) + if (strcmp(e->name, name) == 0 && !e->wildcard) + return e; + + /* find wildcard match */ + + dot = strrchr(name, '.'); + if (!dot) + return NULL; + + len = dot - name; + hash = kdbus_strnhash(name, len); + + hash_for_each_possible(db->entries_hash, e, hentry, hash) + if (e->wildcard && !strncmp(e->name, name, len) && + !e->name[len]) + return e; + + return NULL; +} + +/** + * kdbus_policy_db_clear - release all memory from a policy db + * @db: The policy database + */ +void kdbus_policy_db_clear(struct kdbus_policy_db *db) +{ + struct kdbus_policy_db_entry *e; + struct hlist_node *tmp; + unsigned int i; + + /* purge entries */ + down_write(&db->entries_rwlock); + hash_for_each_safe(db->entries_hash, i, tmp, e, hentry) { + hash_del(&e->hentry); + kdbus_policy_entry_free(e); + } + up_write(&db->entries_rwlock); +} + +/** + * kdbus_policy_db_init() - initialize a new policy database + * @db: The location of the database + * + * This initializes a new policy-db. The underlying memory must have been + * cleared to zero by the caller. + */ +void kdbus_policy_db_init(struct kdbus_policy_db *db) +{ + hash_init(db->entries_hash); + init_rwsem(&db->entries_rwlock); +} + +/** + * kdbus_policy_query_unlocked() - Query the policy database + * @db: Policy database + * @cred: Credentials to test against + * @name: Name to query + * @hash: Hash value of @name + * + * Same as kdbus_policy_query() but requires the caller to lock the policy + * database against concurrent writes. + * + * Return: The highest KDBUS_POLICY_* access type found, or -EPERM if none. 
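kdbus_policy_query() returns the highest access level granted to the supplied credentials, or -EPERM if no entry matches, so a caller can compare the result directly against the required KDBUS_POLICY_* level. A minimal sketch of such a check, assuming kdbus_strhash() (the same helper used by kdbus_policy_set() below) is visible to the caller; foo_check_talk_access() is invented for illustration.

static int foo_check_talk_access(struct kdbus_policy_db *db,
                                 const struct cred *cred, const char *name)
{
        int res;

        res = kdbus_policy_query(db, cred, name, kdbus_strhash(name));
        if (res < KDBUS_POLICY_TALK)
                return -EPERM;  /* also covers the -EPERM "no entry" case */

        return 0;
}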
+ */ +int kdbus_policy_query_unlocked(struct kdbus_policy_db *db, + const struct cred *cred, const char *name, + unsigned int hash) +{ + struct kdbus_policy_db_entry_access *a; + const struct kdbus_policy_db_entry *e; + int i, highest = -EPERM; + + e = kdbus_policy_lookup(db, name, hash); + if (!e) + return -EPERM; + + list_for_each_entry(a, &e->access_list, list) { + if ((int)a->access <= highest) + continue; + + switch (a->type) { + case KDBUS_POLICY_ACCESS_USER: + if (uid_eq(cred->euid, a->uid)) + highest = a->access; + break; + case KDBUS_POLICY_ACCESS_GROUP: + if (gid_eq(cred->egid, a->gid)) { + highest = a->access; + break; + } + + for (i = 0; i < cred->group_info->ngroups; i++) { + kgid_t gid = GROUP_AT(cred->group_info, i); + + if (gid_eq(gid, a->gid)) { + highest = a->access; + break; + } + } + + break; + case KDBUS_POLICY_ACCESS_WORLD: + highest = a->access; + break; + } + + /* OWN is the highest possible policy */ + if (highest >= KDBUS_POLICY_OWN) + break; + } + + return highest; +} + +/** + * kdbus_policy_query() - Query the policy database + * @db: Policy database + * @cred: Credentials to test against + * @name: Name to query + * @hash: Hash value of @name + * + * Query the policy database @db for the access rights of @cred to the name + * @name. The access rights of @cred are returned, or -EPERM if no access is + * granted. + * + * This call effectively searches for the highest access-right granted to + * @cred. The caller should really cache those as policy lookups are rather + * expensive. + * + * Return: The highest KDBUS_POLICY_* access type found, or -EPERM if none. + */ +int kdbus_policy_query(struct kdbus_policy_db *db, const struct cred *cred, + const char *name, unsigned int hash) +{ + int ret; + + down_read(&db->entries_rwlock); + ret = kdbus_policy_query_unlocked(db, cred, name, hash); + up_read(&db->entries_rwlock); + + return ret; +} + +static void __kdbus_policy_remove_owner(struct kdbus_policy_db *db, + const void *owner) +{ + struct kdbus_policy_db_entry *e; + struct hlist_node *tmp; + int i; + + hash_for_each_safe(db->entries_hash, i, tmp, e, hentry) + if (e->owner == owner) { + hash_del(&e->hentry); + kdbus_policy_entry_free(e); + } +} + +/** + * kdbus_policy_remove_owner() - remove all entries related to a connection + * @db: The policy database + * @owner: The connection which items to remove + */ +void kdbus_policy_remove_owner(struct kdbus_policy_db *db, + const void *owner) +{ + down_write(&db->entries_rwlock); + __kdbus_policy_remove_owner(db, owner); + up_write(&db->entries_rwlock); +} + +/* + * Convert user provided policy access to internal kdbus policy + * access + */ +static struct kdbus_policy_db_entry_access * +kdbus_policy_make_access(const struct kdbus_policy_access *uaccess) +{ + int ret; + struct kdbus_policy_db_entry_access *a; + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return ERR_PTR(-ENOMEM); + + ret = -EINVAL; + switch (uaccess->access) { + case KDBUS_POLICY_SEE: + case KDBUS_POLICY_TALK: + case KDBUS_POLICY_OWN: + a->access = uaccess->access; + break; + default: + goto err; + } + + switch (uaccess->type) { + case KDBUS_POLICY_ACCESS_USER: + a->uid = make_kuid(current_user_ns(), uaccess->id); + if (!uid_valid(a->uid)) + goto err; + + break; + case KDBUS_POLICY_ACCESS_GROUP: + a->gid = make_kgid(current_user_ns(), uaccess->id); + if (!gid_valid(a->gid)) + goto err; + + break; + case KDBUS_POLICY_ACCESS_WORLD: + break; + default: + goto err; + } + + a->type = uaccess->type; + + return a; + +err: + kfree(a); + return 
ERR_PTR(ret); +} + +/** + * kdbus_policy_set() - set a connection's policy rules + * @db: The policy database + * @items: A list of kdbus_item elements that contain both + * names and access rules to set. + * @items_size: The total size of the items. + * @max_policies: The maximum number of policy entries to allow. + * Pass 0 for no limit. + * @allow_wildcards: Boolean value whether wildcard entries (such + * ending on '.*') should be allowed. + * @owner: The owner of the new policy items. + * + * This function sets a new set of policies for a given owner. The names and + * access rules are gathered by walking the list of items passed in as + * argument. An item of type KDBUS_ITEM_NAME is expected before any number of + * KDBUS_ITEM_POLICY_ACCESS items. If there are more repetitions of this + * pattern than denoted in @max_policies, -EINVAL is returned. + * + * In order to allow atomic replacement of rules, the function first removes + * all entries that have been created for the given owner previously. + * + * Callers to this function must make sure that the owner is a custom + * endpoint, or if the endpoint is a default endpoint, then it must be + * either a policy holder or an activator. + * + * Return: 0 on success, negative errno on failure. + */ +int kdbus_policy_set(struct kdbus_policy_db *db, + const struct kdbus_item *items, + size_t items_size, + size_t max_policies, + bool allow_wildcards, + const void *owner) +{ + struct kdbus_policy_db_entry_access *a; + struct kdbus_policy_db_entry *e, *p; + const struct kdbus_item *item; + struct hlist_node *tmp; + HLIST_HEAD(entries); + HLIST_HEAD(restore); + size_t count = 0; + int i, ret = 0; + u32 hash; + + /* Walk the list of items and look for new policies */ + e = NULL; + KDBUS_ITEMS_FOREACH(item, items, items_size) { + switch (item->type) { + case KDBUS_ITEM_NAME: { + size_t len; + + if (max_policies && ++count > max_policies) { + ret = -E2BIG; + goto exit; + } + + if (!kdbus_name_is_valid(item->str, true)) { + ret = -EINVAL; + goto exit; + } + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) { + ret = -ENOMEM; + goto exit; + } + + INIT_LIST_HEAD(&e->access_list); + e->owner = owner; + hlist_add_head(&e->hentry, &entries); + + e->name = kstrdup(item->str, GFP_KERNEL); + if (!e->name) { + ret = -ENOMEM; + goto exit; + } + + /* + * If a supplied name ends with an '.*', cut off that + * part, only store anything before it, and mark the + * entry as wildcard. + */ + len = strlen(e->name); + if (len > 2 && + e->name[len - 3] == '.' 
&& + e->name[len - 2] == '*') { + if (!allow_wildcards) { + ret = -EINVAL; + goto exit; + } + + e->name[len - 3] = '\0'; + e->wildcard = true; + } + + break; + } + + case KDBUS_ITEM_POLICY_ACCESS: + if (!e) { + ret = -EINVAL; + goto exit; + } + + a = kdbus_policy_make_access(&item->policy_access); + if (IS_ERR(a)) { + ret = PTR_ERR(a); + goto exit; + } + + list_add_tail(&a->list, &e->access_list); + break; + } + } + + down_write(&db->entries_rwlock); + + /* remember previous entries to restore in case of failure */ + hash_for_each_safe(db->entries_hash, i, tmp, e, hentry) + if (e->owner == owner) { + hash_del(&e->hentry); + hlist_add_head(&e->hentry, &restore); + } + + hlist_for_each_entry_safe(e, tmp, &entries, hentry) { + /* prevent duplicates */ + hash = kdbus_strhash(e->name); + hash_for_each_possible(db->entries_hash, p, hentry, hash) + if (strcmp(e->name, p->name) == 0 && + e->wildcard == p->wildcard) { + ret = -EEXIST; + goto restore; + } + + hlist_del(&e->hentry); + hash_add(db->entries_hash, &e->hentry, hash); + } + +restore: + /* if we failed, flush all entries we added so far */ + if (ret < 0) + __kdbus_policy_remove_owner(db, owner); + + /* if we failed, restore entries, otherwise release them */ + hlist_for_each_entry_safe(e, tmp, &restore, hentry) { + hlist_del(&e->hentry); + if (ret < 0) { + hash = kdbus_strhash(e->name); + hash_add(db->entries_hash, &e->hentry, hash); + } else { + kdbus_policy_entry_free(e); + } + } + + up_write(&db->entries_rwlock); + +exit: + hlist_for_each_entry_safe(e, tmp, &entries, hentry) { + hlist_del(&e->hentry); + kdbus_policy_entry_free(e); + } + + return ret; +} diff -Nur linux-4.2/ipc/kdbus/policy.h linux-4.2-pck/ipc/kdbus/policy.h --- linux-4.2/ipc/kdbus/policy.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/policy.h 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
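kdbus_policy_set() stores a wildcard rule with its trailing wildcard suffix stripped and ->wildcard set, and kdbus_policy_lookup() matches a query by hashing everything before the last '.' of the queried name and comparing it against such stripped entries. The helper below is purely illustrative (it mirrors the lookup logic in plain string terms and is not part of the patch): a stored prefix "com.example" with the wildcard bit set matches a query for "com.example.backend", but not "com.example" itself and not "com.example.backend.child".

static bool foo_wildcard_matches(const char *stored, const char *query)
{
        const char *dot = strrchr(query, '.');
        size_t len;

        if (!dot)
                return false;   /* wildcards never match a single label */

        /* compare the stored prefix against everything before the last dot */
        len = dot - query;
        return strlen(stored) == len && strncmp(stored, query, len) == 0;
}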
+ */ + +#ifndef __KDBUS_POLICY_H +#define __KDBUS_POLICY_H + +#include +#include + +struct kdbus_conn; +struct kdbus_item; + +/** + * struct kdbus_policy_db - policy database + * @entries_hash: Hashtable of entries + * @entries_rwlock: Mutex to protect the database's access entries + */ +struct kdbus_policy_db { + DECLARE_HASHTABLE(entries_hash, 6); + struct rw_semaphore entries_rwlock; +}; + +void kdbus_policy_db_init(struct kdbus_policy_db *db); +void kdbus_policy_db_clear(struct kdbus_policy_db *db); + +int kdbus_policy_query_unlocked(struct kdbus_policy_db *db, + const struct cred *cred, const char *name, + unsigned int hash); +int kdbus_policy_query(struct kdbus_policy_db *db, const struct cred *cred, + const char *name, unsigned int hash); + +void kdbus_policy_remove_owner(struct kdbus_policy_db *db, + const void *owner); +int kdbus_policy_set(struct kdbus_policy_db *db, + const struct kdbus_item *items, + size_t items_size, + size_t max_policies, + bool allow_wildcards, + const void *owner); + +#endif diff -Nur linux-4.2/ipc/kdbus/pool.c linux-4.2-pck/ipc/kdbus/pool.c --- linux-4.2/ipc/kdbus/pool.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/pool.c 2015-09-08 00:55:40.120721602 -0300 @@ -0,0 +1,728 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pool.h" +#include "util.h" + +/** + * struct kdbus_pool - the receiver's buffer + * @f: The backing shmem file + * @size: The size of the file + * @accounted_size: Currently accounted memory in bytes + * @lock: Pool data lock + * @slices: All slices sorted by address + * @slices_busy: Tree of allocated slices + * @slices_free: Tree of free slices + * + * The receiver's buffer, managed as a pool of allocated and free + * slices containing the queued messages. + * + * Messages sent with KDBUS_CMD_SEND are copied directly by the + * sending process into the receiver's pool. + * + * Messages received with KDBUS_CMD_RECV just return the offset + * to the data placed in the pool. + * + * The internally allocated memory needs to be returned by the receiver + * with KDBUS_CMD_FREE. + */ +struct kdbus_pool { + struct file *f; + size_t size; + size_t accounted_size; + struct mutex lock; + + struct list_head slices; + struct rb_root slices_busy; + struct rb_root slices_free; +}; + +/** + * struct kdbus_pool_slice - allocated element in kdbus_pool + * @pool: Pool this slice belongs to + * @off: Offset of slice in the shmem file + * @size: Size of slice + * @entry: Entry in "all slices" list + * @rb_node: Entry in free or busy list + * @free: Unused slice + * @accounted: Accounted as queue slice + * @ref_kernel: Kernel holds a reference + * @ref_user: Userspace holds a reference + * + * The pool has one or more slices, always spanning the entire size of the + * pool. + * + * Every slice is an element in a list sorted by the buffer address, to + * provide access to the next neighbor slice. 
+ * + * Every slice is member in either the busy or the free tree. The free + * tree is organized by slice size, the busy tree organized by buffer + * offset. + */ +struct kdbus_pool_slice { + struct kdbus_pool *pool; + size_t off; + size_t size; + + struct list_head entry; + struct rb_node rb_node; + + bool free:1; + bool accounted:1; + bool ref_kernel:1; + bool ref_user:1; +}; + +static struct kdbus_pool_slice *kdbus_pool_slice_new(struct kdbus_pool *pool, + size_t off, size_t size) +{ + struct kdbus_pool_slice *slice; + + slice = kzalloc(sizeof(*slice), GFP_KERNEL); + if (!slice) + return NULL; + + slice->pool = pool; + slice->off = off; + slice->size = size; + slice->free = true; + return slice; +} + +/* insert a slice into the free tree */ +static void kdbus_pool_add_free_slice(struct kdbus_pool *pool, + struct kdbus_pool_slice *slice) +{ + struct rb_node **n; + struct rb_node *pn = NULL; + + n = &pool->slices_free.rb_node; + while (*n) { + struct kdbus_pool_slice *pslice; + + pn = *n; + pslice = rb_entry(pn, struct kdbus_pool_slice, rb_node); + if (slice->size < pslice->size) + n = &pn->rb_left; + else + n = &pn->rb_right; + } + + rb_link_node(&slice->rb_node, pn, n); + rb_insert_color(&slice->rb_node, &pool->slices_free); +} + +/* insert a slice into the busy tree */ +static void kdbus_pool_add_busy_slice(struct kdbus_pool *pool, + struct kdbus_pool_slice *slice) +{ + struct rb_node **n; + struct rb_node *pn = NULL; + + n = &pool->slices_busy.rb_node; + while (*n) { + struct kdbus_pool_slice *pslice; + + pn = *n; + pslice = rb_entry(pn, struct kdbus_pool_slice, rb_node); + if (slice->off < pslice->off) + n = &pn->rb_left; + else if (slice->off > pslice->off) + n = &pn->rb_right; + else + BUG(); + } + + rb_link_node(&slice->rb_node, pn, n); + rb_insert_color(&slice->rb_node, &pool->slices_busy); +} + +static struct kdbus_pool_slice *kdbus_pool_find_slice(struct kdbus_pool *pool, + size_t off) +{ + struct rb_node *n; + + n = pool->slices_busy.rb_node; + while (n) { + struct kdbus_pool_slice *s; + + s = rb_entry(n, struct kdbus_pool_slice, rb_node); + if (off < s->off) + n = n->rb_left; + else if (off > s->off) + n = n->rb_right; + else + return s; + } + + return NULL; +} + +/** + * kdbus_pool_slice_alloc() - allocate memory from a pool + * @pool: The receiver's pool + * @size: The number of bytes to allocate + * @accounted: Whether this slice should be accounted for + * + * The returned slice is used for kdbus_pool_slice_release() to + * free the allocated memory. If either @kvec or @iovec is non-NULL, the data + * will be copied from kernel or userspace memory into the new slice at + * offset 0. + * + * Return: the allocated slice on success, ERR_PTR on failure. 
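+ *
+ * For illustration, a hypothetical in-kernel caller (not a real call
+ * site; @pool, @data and @data_len are assumed to exist in the caller)
+ * could allocate a slice, fill it and hand it over to user-space like
+ * this:
+ *
+ *	struct kdbus_pool_slice *slice;
+ *	struct kvec kvec;
+ *	u64 off, sz, total = 0;
+ *
+ *	kdbus_kvec_set(&kvec, data, data_len, &total);
+ *	slice = kdbus_pool_slice_alloc(pool, total, true);
+ *	if (IS_ERR(slice))
+ *		return PTR_ERR(slice);
+ *	kdbus_pool_slice_copy_kvec(slice, 0, &kvec, 1, total);
+ *	kdbus_pool_slice_publish(slice, &off, &sz);
+ *	kdbus_pool_slice_release(slice);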
+ */ +struct kdbus_pool_slice *kdbus_pool_slice_alloc(struct kdbus_pool *pool, + size_t size, bool accounted) +{ + size_t slice_size = KDBUS_ALIGN8(size); + struct rb_node *n, *found = NULL; + struct kdbus_pool_slice *s; + int ret = 0; + + if (WARN_ON(!size)) + return ERR_PTR(-EINVAL); + + /* search a free slice with the closest matching size */ + mutex_lock(&pool->lock); + n = pool->slices_free.rb_node; + while (n) { + s = rb_entry(n, struct kdbus_pool_slice, rb_node); + if (slice_size < s->size) { + found = n; + n = n->rb_left; + } else if (slice_size > s->size) { + n = n->rb_right; + } else { + found = n; + break; + } + } + + /* no slice with the minimum size found in the pool */ + if (!found) { + ret = -EXFULL; + goto exit_unlock; + } + + /* no exact match, use the closest one */ + if (!n) { + struct kdbus_pool_slice *s_new; + + s = rb_entry(found, struct kdbus_pool_slice, rb_node); + + /* split-off the remainder of the size to its own slice */ + s_new = kdbus_pool_slice_new(pool, s->off + slice_size, + s->size - slice_size); + if (!s_new) { + ret = -ENOMEM; + goto exit_unlock; + } + + list_add(&s_new->entry, &s->entry); + kdbus_pool_add_free_slice(pool, s_new); + + /* adjust our size now that we split-off another slice */ + s->size = slice_size; + } + + /* move slice from free to the busy tree */ + rb_erase(found, &pool->slices_free); + kdbus_pool_add_busy_slice(pool, s); + + WARN_ON(s->ref_kernel || s->ref_user); + + s->ref_kernel = true; + s->free = false; + s->accounted = accounted; + if (accounted) + pool->accounted_size += s->size; + mutex_unlock(&pool->lock); + + return s; + +exit_unlock: + mutex_unlock(&pool->lock); + return ERR_PTR(ret); +} + +static void __kdbus_pool_slice_release(struct kdbus_pool_slice *slice) +{ + struct kdbus_pool *pool = slice->pool; + + /* don't free the slice if either has a reference */ + if (slice->ref_kernel || slice->ref_user) + return; + + if (WARN_ON(slice->free)) + return; + + rb_erase(&slice->rb_node, &pool->slices_busy); + + /* merge with the next free slice */ + if (!list_is_last(&slice->entry, &pool->slices)) { + struct kdbus_pool_slice *s; + + s = list_entry(slice->entry.next, + struct kdbus_pool_slice, entry); + if (s->free) { + rb_erase(&s->rb_node, &pool->slices_free); + list_del(&s->entry); + slice->size += s->size; + kfree(s); + } + } + + /* merge with previous free slice */ + if (pool->slices.next != &slice->entry) { + struct kdbus_pool_slice *s; + + s = list_entry(slice->entry.prev, + struct kdbus_pool_slice, entry); + if (s->free) { + rb_erase(&s->rb_node, &pool->slices_free); + list_del(&slice->entry); + s->size += slice->size; + kfree(slice); + slice = s; + } + } + + slice->free = true; + kdbus_pool_add_free_slice(pool, slice); +} + +/** + * kdbus_pool_slice_release() - drop kernel-reference on allocated slice + * @slice: Slice allocated from the pool + * + * This releases the kernel-reference on the given slice. If the + * kernel-reference and the user-reference on a slice are dropped, the slice is + * returned to the pool. + * + * So far, we do not implement full ref-counting on slices. Each, kernel and + * user-space can have exactly one reference to a slice. If both are dropped at + * the same time, the slice is released. 
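+ *
+ * In the queueing path this pairs with kdbus_pool_slice_publish(), which
+ * hands the user-space reference out, and with
+ * kdbus_pool_release_offset(), which drops that reference again once
+ * user-space issues KDBUS_CMD_FREE. A simplified, hypothetical sequence:
+ *
+ *	kdbus_pool_slice_publish(slice, &off, &sz);
+ *	kdbus_pool_slice_release(slice);
+ *	...
+ *	kdbus_pool_release_offset(pool, off);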
+ */ +void kdbus_pool_slice_release(struct kdbus_pool_slice *slice) +{ + struct kdbus_pool *pool; + + if (!slice) + return; + + /* @slice may be freed, so keep local ptr to @pool */ + pool = slice->pool; + + mutex_lock(&pool->lock); + /* kernel must own a ref to @slice to drop it */ + WARN_ON(!slice->ref_kernel); + slice->ref_kernel = false; + /* no longer kernel-owned, de-account slice */ + if (slice->accounted && !WARN_ON(pool->accounted_size < slice->size)) + pool->accounted_size -= slice->size; + __kdbus_pool_slice_release(slice); + mutex_unlock(&pool->lock); +} + +/** + * kdbus_pool_release_offset() - release a public offset + * @pool: pool to operate on + * @off: offset to release + * + * This should be called whenever user-space frees a slice given to them. It + * verifies the slice is available and public, and then drops it. It ensures + * correct locking and barriers against queues. + * + * Return: 0 on success, ENXIO if the offset is invalid or not public. + */ +int kdbus_pool_release_offset(struct kdbus_pool *pool, size_t off) +{ + struct kdbus_pool_slice *slice; + int ret = 0; + + /* 'pool->size' is used as dummy offset for empty slices */ + if (off == pool->size) + return 0; + + mutex_lock(&pool->lock); + slice = kdbus_pool_find_slice(pool, off); + if (slice && slice->ref_user) { + slice->ref_user = false; + __kdbus_pool_slice_release(slice); + } else { + ret = -ENXIO; + } + mutex_unlock(&pool->lock); + + return ret; +} + +/** + * kdbus_pool_publish_empty() - publish empty slice to user-space + * @pool: pool to operate on + * @off: output storage for offset, or NULL + * @size: output storage for size, or NULL + * + * This is the same as kdbus_pool_slice_publish(), but uses a dummy slice with + * size 0. The returned offset points to the end of the pool and is never + * returned on real slices. + */ +void kdbus_pool_publish_empty(struct kdbus_pool *pool, u64 *off, u64 *size) +{ + if (off) + *off = pool->size; + if (size) + *size = 0; +} + +/** + * kdbus_pool_slice_publish() - publish slice to user-space + * @slice: The slice + * @out_offset: Output storage for offset, or NULL + * @out_size: Output storage for size, or NULL + * + * This prepares a slice to be published to user-space. + * + * This call combines the following operations: + * * the memory region is flushed so the user's memory view is consistent + * * the slice is marked as referenced by user-space, so user-space has to + * call KDBUS_CMD_FREE to release it + * * the offset and size of the slice are written to the given output + * arguments, if non-NULL + */ +void kdbus_pool_slice_publish(struct kdbus_pool_slice *slice, + u64 *out_offset, u64 *out_size) +{ + mutex_lock(&slice->pool->lock); + /* kernel must own a ref to @slice to gain a user-space ref */ + WARN_ON(!slice->ref_kernel); + slice->ref_user = true; + mutex_unlock(&slice->pool->lock); + + if (out_offset) + *out_offset = slice->off; + if (out_size) + *out_size = slice->size; +} + +/** + * kdbus_pool_slice_offset() - Get a slice's offset inside the pool + * @slice: Slice to return the offset of + * + * Return: The internal offset @slice inside the pool. 
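+ *
+ * The offset is pool-relative; it is the same value that
+ * kdbus_pool_slice_publish() reports to user-space and that
+ * kdbus_pool_release_offset() later resolves back to this slice.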
+ */ +off_t kdbus_pool_slice_offset(const struct kdbus_pool_slice *slice) +{ + return slice->off; +} + +/** + * kdbus_pool_slice_size() - get size of a pool slice + * @slice: slice to query + * + * Return: size of the given slice + */ +size_t kdbus_pool_slice_size(const struct kdbus_pool_slice *slice) +{ + return slice->size; +} + +/** + * kdbus_pool_new() - create a new pool + * @name: Name of the (deleted) file which shows up in + * /proc, used for debugging + * @size: Maximum size of the pool + * + * Return: a new kdbus_pool on success, ERR_PTR on failure. + */ +struct kdbus_pool *kdbus_pool_new(const char *name, size_t size) +{ + struct kdbus_pool_slice *s; + struct kdbus_pool *p; + struct file *f; + char *n = NULL; + int ret; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + if (name) { + n = kasprintf(GFP_KERNEL, KBUILD_MODNAME "-conn:%s", name); + if (!n) { + ret = -ENOMEM; + goto exit_free; + } + } + + f = shmem_file_setup(n ?: KBUILD_MODNAME "-conn", size, 0, 0); + kfree(n); + + if (IS_ERR(f)) { + ret = PTR_ERR(f); + goto exit_free; + } + + ret = get_write_access(file_inode(f)); + if (ret < 0) + goto exit_put_shmem; + + /* allocate first slice spanning the entire pool */ + s = kdbus_pool_slice_new(p, 0, size); + if (!s) { + ret = -ENOMEM; + goto exit_put_write; + } + + p->f = f; + p->size = size; + p->slices_free = RB_ROOT; + p->slices_busy = RB_ROOT; + mutex_init(&p->lock); + + INIT_LIST_HEAD(&p->slices); + list_add(&s->entry, &p->slices); + + kdbus_pool_add_free_slice(p, s); + return p; + +exit_put_write: + put_write_access(file_inode(f)); +exit_put_shmem: + fput(f); +exit_free: + kfree(p); + return ERR_PTR(ret); +} + +/** + * kdbus_pool_free() - destroy pool + * @pool: The receiver's pool + */ +void kdbus_pool_free(struct kdbus_pool *pool) +{ + struct kdbus_pool_slice *s, *tmp; + + if (!pool) + return; + + list_for_each_entry_safe(s, tmp, &pool->slices, entry) { + list_del(&s->entry); + kfree(s); + } + + put_write_access(file_inode(pool->f)); + fput(pool->f); + kfree(pool); +} + +/** + * kdbus_pool_accounted() - retrieve accounting information + * @pool: pool to query + * @size: output for overall pool size + * @acc: output for currently accounted size + * + * This returns accounting information of the pool. Note that the data might + * change after the function returns, as the pool lock is dropped. You need to + * protect the data via other means, if you need reliable accounting. + */ +void kdbus_pool_accounted(struct kdbus_pool *pool, size_t *size, size_t *acc) +{ + mutex_lock(&pool->lock); + if (size) + *size = pool->size; + if (acc) + *acc = pool->accounted_size; + mutex_unlock(&pool->lock); +} + +/** + * kdbus_pool_slice_copy_iovec() - copy user memory to a slice + * @slice: The slice to write to + * @off: Offset in the slice to write to + * @iov: iovec array, pointing to data to copy + * @iov_len: Number of elements in @iov + * @total_len: Total number of bytes described in members of @iov + * + * User memory referenced by @iov will be copied into @slice at offset @off. + * + * Return: the numbers of bytes copied, negative errno on failure. 
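+ *
+ * A hypothetical caller that already assembled and validated an iovec
+ * array describing user memory (@iov, @iov_count and @total are
+ * assumptions of this sketch) would use it as:
+ *
+ *	ssize_t ret;
+ *
+ *	ret = kdbus_pool_slice_copy_iovec(slice, 0, iov, iov_count, total);
+ *	if (ret < 0)
+ *		return ret;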
+ */ +ssize_t +kdbus_pool_slice_copy_iovec(const struct kdbus_pool_slice *slice, loff_t off, + struct iovec *iov, size_t iov_len, size_t total_len) +{ + struct iov_iter iter; + ssize_t len; + + if (WARN_ON(off + total_len > slice->size)) + return -EFAULT; + + off += slice->off; + iov_iter_init(&iter, WRITE, iov, iov_len, total_len); + len = vfs_iter_write(slice->pool->f, &iter, &off); + + return (len >= 0 && len != total_len) ? -EFAULT : len; +} + +/** + * kdbus_pool_slice_copy_kvec() - copy kernel memory to a slice + * @slice: The slice to write to + * @off: Offset in the slice to write to + * @kvec: kvec array, pointing to data to copy + * @kvec_len: Number of elements in @kvec + * @total_len: Total number of bytes described in members of @kvec + * + * Kernel memory referenced by @kvec will be copied into @slice at offset @off. + * + * Return: the numbers of bytes copied, negative errno on failure. + */ +ssize_t kdbus_pool_slice_copy_kvec(const struct kdbus_pool_slice *slice, + loff_t off, struct kvec *kvec, + size_t kvec_len, size_t total_len) +{ + struct iov_iter iter; + mm_segment_t old_fs; + ssize_t len; + + if (WARN_ON(off + total_len > slice->size)) + return -EFAULT; + + off += slice->off; + iov_iter_kvec(&iter, WRITE | ITER_KVEC, kvec, kvec_len, total_len); + + old_fs = get_fs(); + set_fs(get_ds()); + len = vfs_iter_write(slice->pool->f, &iter, &off); + set_fs(old_fs); + + return (len >= 0 && len != total_len) ? -EFAULT : len; +} + +/** + * kdbus_pool_slice_copy() - copy data from one slice into another + * @slice_dst: destination slice + * @slice_src: source slice + * + * Return: 0 on success, negative error number on failure. + */ +int kdbus_pool_slice_copy(const struct kdbus_pool_slice *slice_dst, + const struct kdbus_pool_slice *slice_src) +{ + struct file *f_src = slice_src->pool->f; + struct file *f_dst = slice_dst->pool->f; + struct inode *i_dst = file_inode(f_dst); + struct address_space *mapping_dst = f_dst->f_mapping; + const struct address_space_operations *aops = mapping_dst->a_ops; + unsigned long len = slice_src->size; + loff_t off_src = slice_src->off; + loff_t off_dst = slice_dst->off; + mm_segment_t old_fs; + int ret = 0; + + if (WARN_ON(slice_src->size != slice_dst->size) || + WARN_ON(slice_src->free || slice_dst->free)) + return -EINVAL; + + mutex_lock(&i_dst->i_mutex); + old_fs = get_fs(); + set_fs(get_ds()); + while (len > 0) { + unsigned long page_off; + unsigned long copy_len; + char __user *kaddr; + struct page *page; + ssize_t n_read; + void *fsdata; + long status; + + page_off = off_dst & (PAGE_CACHE_SIZE - 1); + copy_len = min_t(unsigned long, + PAGE_CACHE_SIZE - page_off, len); + + status = aops->write_begin(f_dst, mapping_dst, off_dst, + copy_len, 0, &page, &fsdata); + if (unlikely(status < 0)) { + ret = status; + break; + } + + kaddr = (char __force __user *)kmap(page) + page_off; + n_read = __vfs_read(f_src, kaddr, copy_len, &off_src); + kunmap(page); + mark_page_accessed(page); + flush_dcache_page(page); + + if (unlikely(n_read != copy_len)) { + ret = -EFAULT; + break; + } + + status = aops->write_end(f_dst, mapping_dst, off_dst, + copy_len, copy_len, page, fsdata); + if (unlikely(status != copy_len)) { + ret = -EFAULT; + break; + } + + off_dst += copy_len; + len -= copy_len; + } + set_fs(old_fs); + mutex_unlock(&i_dst->i_mutex); + + return ret; +} + +/** + * kdbus_pool_mmap() - map the pool into the process + * @pool: The receiver's pool + * @vma: passed by mmap() syscall + * + * Return: the result of the mmap() call, negative errno on failure. 
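+ *
+ * User-space maps the pool read-only through its connection file
+ * descriptor and addresses received messages by the offset reported by
+ * KDBUS_CMD_RECV, releasing them again with KDBUS_CMD_FREE. A rough,
+ * hypothetical user-space sketch (uapi details and error handling
+ * omitted; pool_size, conn_fd and offset are assumptions):
+ *
+ *	uint8_t *buf = mmap(NULL, pool_size, PROT_READ, MAP_SHARED,
+ *			    conn_fd, 0);
+ *	struct kdbus_msg *msg = (struct kdbus_msg *)(buf + offset);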
+ */ +int kdbus_pool_mmap(const struct kdbus_pool *pool, struct vm_area_struct *vma) +{ + /* deny write access to the pool */ + if (vma->vm_flags & VM_WRITE) + return -EPERM; + vma->vm_flags &= ~VM_MAYWRITE; + + /* do not allow to map more than the size of the file */ + if ((vma->vm_end - vma->vm_start) > pool->size) + return -EFAULT; + + /* replace the connection file with our shmem file */ + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = get_file(pool->f); + + return pool->f->f_op->mmap(pool->f, vma); +} diff -Nur linux-4.2/ipc/kdbus/pool.h linux-4.2-pck/ipc/kdbus/pool.h --- linux-4.2/ipc/kdbus/pool.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/pool.h 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_POOL_H +#define __KDBUS_POOL_H + +#include + +struct kdbus_pool; +struct kdbus_pool_slice; + +struct kdbus_pool *kdbus_pool_new(const char *name, size_t size); +void kdbus_pool_free(struct kdbus_pool *pool); +void kdbus_pool_accounted(struct kdbus_pool *pool, size_t *size, size_t *acc); +int kdbus_pool_mmap(const struct kdbus_pool *pool, struct vm_area_struct *vma); +int kdbus_pool_release_offset(struct kdbus_pool *pool, size_t off); +void kdbus_pool_publish_empty(struct kdbus_pool *pool, u64 *off, u64 *size); + +struct kdbus_pool_slice *kdbus_pool_slice_alloc(struct kdbus_pool *pool, + size_t size, bool accounted); +void kdbus_pool_slice_release(struct kdbus_pool_slice *slice); +void kdbus_pool_slice_publish(struct kdbus_pool_slice *slice, + u64 *out_offset, u64 *out_size); +off_t kdbus_pool_slice_offset(const struct kdbus_pool_slice *slice); +size_t kdbus_pool_slice_size(const struct kdbus_pool_slice *slice); +int kdbus_pool_slice_copy(const struct kdbus_pool_slice *slice_dst, + const struct kdbus_pool_slice *slice_src); +ssize_t kdbus_pool_slice_copy_kvec(const struct kdbus_pool_slice *slice, + loff_t off, struct kvec *kvec, + size_t kvec_count, size_t total_len); +ssize_t kdbus_pool_slice_copy_iovec(const struct kdbus_pool_slice *slice, + loff_t off, struct iovec *iov, + size_t iov_count, size_t total_len); + +#endif diff -Nur linux-4.2/ipc/kdbus/queue.c linux-4.2-pck/ipc/kdbus/queue.c --- linux-4.2/ipc/kdbus/queue.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/queue.c 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,363 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "domain.h" +#include "connection.h" +#include "item.h" +#include "message.h" +#include "metadata.h" +#include "queue.h" +#include "reply.h" + +/** + * kdbus_queue_init() - initialize data structure related to a queue + * @queue: The queue to initialize + */ +void kdbus_queue_init(struct kdbus_queue *queue) +{ + INIT_LIST_HEAD(&queue->msg_list); + queue->msg_prio_queue = RB_ROOT; +} + +/** + * kdbus_queue_peek() - Retrieves an entry from a queue + * @queue: The queue + * @priority: The minimum priority of the entry to peek + * @use_priority: Boolean flag whether or not to peek by priority + * + * Look for a entry in a queue, either by priority, or the oldest one (FIFO). + * The entry is not freed, put off the queue's lists or anything else. + * + * Return: the peeked queue entry on success, NULL if no suitable msg is found + */ +struct kdbus_queue_entry *kdbus_queue_peek(struct kdbus_queue *queue, + s64 priority, bool use_priority) +{ + struct kdbus_queue_entry *e; + + if (list_empty(&queue->msg_list)) + return NULL; + + if (use_priority) { + /* get next entry with highest priority */ + e = rb_entry(queue->msg_prio_highest, + struct kdbus_queue_entry, prio_node); + + /* no entry with the requested priority */ + if (e->priority > priority) + return NULL; + } else { + /* ignore the priority, return the next entry in the entry */ + e = list_first_entry(&queue->msg_list, + struct kdbus_queue_entry, entry); + } + + return e; +} + +static void kdbus_queue_entry_link(struct kdbus_queue_entry *entry) +{ + struct kdbus_queue *queue = &entry->conn->queue; + struct rb_node **n, *pn = NULL; + bool highest = true; + + lockdep_assert_held(&entry->conn->lock); + if (WARN_ON(!list_empty(&entry->entry))) + return; + + /* sort into priority entry tree */ + n = &queue->msg_prio_queue.rb_node; + while (*n) { + struct kdbus_queue_entry *e; + + pn = *n; + e = rb_entry(pn, struct kdbus_queue_entry, prio_node); + + /* existing node for this priority, add to its list */ + if (likely(entry->priority == e->priority)) { + list_add_tail(&entry->prio_entry, &e->prio_entry); + goto prio_done; + } + + if (entry->priority < e->priority) { + n = &pn->rb_left; + } else { + n = &pn->rb_right; + highest = false; + } + } + + /* cache highest-priority entry */ + if (highest) + queue->msg_prio_highest = &entry->prio_node; + + /* new node for this priority */ + rb_link_node(&entry->prio_node, pn, n); + rb_insert_color(&entry->prio_node, &queue->msg_prio_queue); + INIT_LIST_HEAD(&entry->prio_entry); + +prio_done: + /* add to unsorted fifo list */ + list_add_tail(&entry->entry, &queue->msg_list); +} + +static void kdbus_queue_entry_unlink(struct kdbus_queue_entry *entry) +{ + struct kdbus_queue *queue = &entry->conn->queue; + + lockdep_assert_held(&entry->conn->lock); + if (list_empty(&entry->entry)) + return; + + list_del_init(&entry->entry); + + if (list_empty(&entry->prio_entry)) { + /* + * Single entry for this priority, update cached + * highest-priority entry, remove the tree node. + */ + if (queue->msg_prio_highest == &entry->prio_node) + queue->msg_prio_highest = rb_next(&entry->prio_node); + + rb_erase(&entry->prio_node, &queue->msg_prio_queue); + } else { + struct kdbus_queue_entry *q; + + /* + * Multiple entries for this priority entry, get next one in + * the list. 
Update cached highest-priority entry, store the + * new one as the tree node. + */ + q = list_first_entry(&entry->prio_entry, + struct kdbus_queue_entry, prio_entry); + list_del(&entry->prio_entry); + + if (queue->msg_prio_highest == &entry->prio_node) + queue->msg_prio_highest = &q->prio_node; + + rb_replace_node(&entry->prio_node, &q->prio_node, + &queue->msg_prio_queue); + } +} + +/** + * kdbus_queue_entry_new() - allocate a queue entry + * @src: source connection, or NULL + * @dst: destination connection + * @s: staging object carrying the message + * + * Allocates a queue entry based on a given msg and allocate space for + * the message payload and the requested metadata in the connection's pool. + * The entry is not actually added to the queue's lists at this point. + * + * Return: the allocated entry on success, or an ERR_PTR on failures. + */ +struct kdbus_queue_entry *kdbus_queue_entry_new(struct kdbus_conn *src, + struct kdbus_conn *dst, + struct kdbus_staging *s) +{ + struct kdbus_queue_entry *entry; + int ret; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&entry->entry); + entry->priority = s->msg->priority; + entry->conn = kdbus_conn_ref(dst); + entry->gaps = kdbus_gaps_ref(s->gaps); + + entry->slice = kdbus_staging_emit(s, src, dst); + if (IS_ERR(entry->slice)) { + ret = PTR_ERR(entry->slice); + entry->slice = NULL; + goto error; + } + + entry->user = src ? kdbus_user_ref(src->user) : NULL; + return entry; + +error: + kdbus_queue_entry_free(entry); + return ERR_PTR(ret); +} + +/** + * kdbus_queue_entry_free() - free resources of an entry + * @entry: The entry to free + * + * Removes resources allocated by a queue entry, along with the entry itself. + * Note that the entry's slice is not freed at this point. + */ +void kdbus_queue_entry_free(struct kdbus_queue_entry *entry) +{ + if (!entry) + return; + + lockdep_assert_held(&entry->conn->lock); + + kdbus_queue_entry_unlink(entry); + kdbus_reply_unref(entry->reply); + + if (entry->slice) { + kdbus_conn_quota_dec(entry->conn, entry->user, + kdbus_pool_slice_size(entry->slice), + entry->gaps ? entry->gaps->n_fds : 0); + kdbus_pool_slice_release(entry->slice); + } + + kdbus_user_unref(entry->user); + kdbus_gaps_unref(entry->gaps); + kdbus_conn_unref(entry->conn); + kfree(entry); +} + +/** + * kdbus_queue_entry_install() - install message components into the + * receiver's process + * @entry: The queue entry to install + * @return_flags: Pointer to store the return flags for userspace + * @install_fds: Whether or not to install associated file descriptors + * + * Return: 0 on success. + */ +int kdbus_queue_entry_install(struct kdbus_queue_entry *entry, + u64 *return_flags, bool install_fds) +{ + bool incomplete_fds = false; + int ret; + + lockdep_assert_held(&entry->conn->lock); + + ret = kdbus_gaps_install(entry->gaps, entry->slice, &incomplete_fds); + if (ret < 0) + return ret; + + if (incomplete_fds) + *return_flags |= KDBUS_RECV_RETURN_INCOMPLETE_FDS; + return 0; +} + +/** + * kdbus_queue_entry_enqueue() - enqueue an entry + * @entry: entry to enqueue + * @reply: reply to link to this entry (or NULL if none) + * + * This enqueues an unqueued entry into the message queue of the linked + * connection. It also binds a reply object to the entry so we can remember it + * when the message is moved. + * + * Once this call returns (and the connection lock is released), this entry can + * be dequeued by the target connection. 
Note that the entry will not be removed + * from the queue until it is destroyed. + */ +void kdbus_queue_entry_enqueue(struct kdbus_queue_entry *entry, + struct kdbus_reply *reply) +{ + lockdep_assert_held(&entry->conn->lock); + + if (WARN_ON(entry->reply) || WARN_ON(!list_empty(&entry->entry))) + return; + + entry->reply = kdbus_reply_ref(reply); + kdbus_queue_entry_link(entry); +} + +/** + * kdbus_queue_entry_move() - move queue entry + * @e: queue entry to move + * @dst: destination connection to queue the entry on + * + * This moves a queue entry onto a different connection. It allocates a new + * slice on the target connection and copies the message over. If the copy + * succeeded, we move the entry from @src to @dst. + * + * On failure, the entry is left untouched. + * + * The queue entry must be queued right now, and after the call succeeds it will + * be queued on the destination, but no longer on the source. + * + * The caller must hold the connection lock of the source *and* destination. + * + * Return: 0 on success, negative error code on failure. + */ +int kdbus_queue_entry_move(struct kdbus_queue_entry *e, + struct kdbus_conn *dst) +{ + struct kdbus_pool_slice *slice = NULL; + struct kdbus_conn *src = e->conn; + size_t size, fds; + int ret; + + lockdep_assert_held(&src->lock); + lockdep_assert_held(&dst->lock); + + if (WARN_ON(list_empty(&e->entry))) + return -EINVAL; + if (src == dst) + return 0; + + size = kdbus_pool_slice_size(e->slice); + fds = e->gaps ? e->gaps->n_fds : 0; + + ret = kdbus_conn_quota_inc(dst, e->user, size, fds); + if (ret < 0) + return ret; + + slice = kdbus_pool_slice_alloc(dst->pool, size, true); + if (IS_ERR(slice)) { + ret = PTR_ERR(slice); + slice = NULL; + goto error; + } + + ret = kdbus_pool_slice_copy(slice, e->slice); + if (ret < 0) + goto error; + + kdbus_queue_entry_unlink(e); + kdbus_conn_quota_dec(src, e->user, size, fds); + kdbus_pool_slice_release(e->slice); + kdbus_conn_unref(e->conn); + + e->slice = slice; + e->conn = kdbus_conn_ref(dst); + kdbus_queue_entry_link(e); + + return 0; + +error: + kdbus_pool_slice_release(slice); + kdbus_conn_quota_dec(dst, e->user, size, fds); + return ret; +} diff -Nur linux-4.2/ipc/kdbus/queue.h linux-4.2-pck/ipc/kdbus/queue.h --- linux-4.2/ipc/kdbus/queue.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/queue.h 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_QUEUE_H +#define __KDBUS_QUEUE_H + +#include +#include + +struct kdbus_conn; +struct kdbus_pool_slice; +struct kdbus_reply; +struct kdbus_staging; +struct kdbus_user; + +/** + * struct kdbus_queue - a connection's message queue + * @msg_list: List head for kdbus_queue_entry objects + * @msg_prio_queue: RB tree root for messages, sorted by priority + * @msg_prio_highest: Link to the RB node referencing the message with the + * highest priority in the tree. 
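+ *
+ * Entries are kept on @msg_list in FIFO order and, in parallel, in
+ * @msg_prio_queue keyed by priority; entries sharing a priority hang off
+ * one tree node via their prio_entry list. @msg_prio_highest caches the
+ * node with the numerically lowest (i.e. most urgent) priority so that
+ * kdbus_queue_peek() does not have to walk the tree.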
+ */ +struct kdbus_queue { + struct list_head msg_list; + struct rb_root msg_prio_queue; + struct rb_node *msg_prio_highest; +}; + +/** + * struct kdbus_queue_entry - messages waiting to be read + * @entry: Entry in the connection's list + * @prio_node: Entry in the priority queue tree + * @prio_entry: Queue tree node entry in the list of one priority + * @priority: Message priority + * @dst_name_id: The sequence number of the name this message is + * addressed to, 0 for messages sent to an ID + * @conn: Connection this entry is queued on + * @gaps: Gaps object to fill message gaps at RECV time + * @user: User used for accounting + * @slice: Slice in the receiver's pool for the message + * @reply: The reply block if a reply to this message is expected + */ +struct kdbus_queue_entry { + struct list_head entry; + struct rb_node prio_node; + struct list_head prio_entry; + + s64 priority; + u64 dst_name_id; + + struct kdbus_conn *conn; + struct kdbus_gaps *gaps; + struct kdbus_user *user; + struct kdbus_pool_slice *slice; + struct kdbus_reply *reply; +}; + +void kdbus_queue_init(struct kdbus_queue *queue); +struct kdbus_queue_entry *kdbus_queue_peek(struct kdbus_queue *queue, + s64 priority, bool use_priority); + +struct kdbus_queue_entry *kdbus_queue_entry_new(struct kdbus_conn *src, + struct kdbus_conn *dst, + struct kdbus_staging *s); +void kdbus_queue_entry_free(struct kdbus_queue_entry *entry); +int kdbus_queue_entry_install(struct kdbus_queue_entry *entry, + u64 *return_flags, bool install_fds); +void kdbus_queue_entry_enqueue(struct kdbus_queue_entry *entry, + struct kdbus_reply *reply); +int kdbus_queue_entry_move(struct kdbus_queue_entry *entry, + struct kdbus_conn *dst); + +#endif /* __KDBUS_QUEUE_H */ diff -Nur linux-4.2/ipc/kdbus/reply.c linux-4.2-pck/ipc/kdbus/reply.c --- linux-4.2/ipc/kdbus/reply.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/reply.c 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,252 @@ +#include +#include +#include +#include +#include +#include + +#include "bus.h" +#include "connection.h" +#include "endpoint.h" +#include "message.h" +#include "metadata.h" +#include "names.h" +#include "domain.h" +#include "item.h" +#include "notify.h" +#include "policy.h" +#include "reply.h" +#include "util.h" + +/** + * kdbus_reply_new() - Allocate and set up a new kdbus_reply object + * @reply_src: The connection a reply is expected from + * @reply_dst: The connection this reply object belongs to + * @msg: Message associated with the reply + * @name_entry: Name entry used to send the message + * @sync: Whether or not to make this reply synchronous + * + * Allocate and fill a new kdbus_reply object. + * + * Return: New kdbus_conn object on success, ERR_PTR on error. + */ +struct kdbus_reply *kdbus_reply_new(struct kdbus_conn *reply_src, + struct kdbus_conn *reply_dst, + const struct kdbus_msg *msg, + struct kdbus_name_entry *name_entry, + bool sync) +{ + struct kdbus_reply *r; + int ret; + + if (atomic_inc_return(&reply_dst->request_count) > + KDBUS_CONN_MAX_REQUESTS_PENDING) { + ret = -EMLINK; + goto exit_dec_request_count; + } + + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) { + ret = -ENOMEM; + goto exit_dec_request_count; + } + + kref_init(&r->kref); + INIT_LIST_HEAD(&r->entry); + r->reply_src = kdbus_conn_ref(reply_src); + r->reply_dst = kdbus_conn_ref(reply_dst); + r->cookie = msg->cookie; + r->name_id = name_entry ? 
name_entry->name_id : 0; + r->deadline_ns = msg->timeout_ns; + + if (sync) { + r->sync = true; + r->waiting = true; + } + + return r; + +exit_dec_request_count: + atomic_dec(&reply_dst->request_count); + return ERR_PTR(ret); +} + +static void __kdbus_reply_free(struct kref *kref) +{ + struct kdbus_reply *reply = + container_of(kref, struct kdbus_reply, kref); + + atomic_dec(&reply->reply_dst->request_count); + kdbus_conn_unref(reply->reply_src); + kdbus_conn_unref(reply->reply_dst); + kfree(reply); +} + +/** + * kdbus_reply_ref() - Increase reference on kdbus_reply + * @r: The reply, may be %NULL + * + * Return: The reply object with an extra reference + */ +struct kdbus_reply *kdbus_reply_ref(struct kdbus_reply *r) +{ + if (r) + kref_get(&r->kref); + return r; +} + +/** + * kdbus_reply_unref() - Decrease reference on kdbus_reply + * @r: The reply, may be %NULL + * + * Return: NULL + */ +struct kdbus_reply *kdbus_reply_unref(struct kdbus_reply *r) +{ + if (r) + kref_put(&r->kref, __kdbus_reply_free); + return NULL; +} + +/** + * kdbus_reply_link() - Link reply object into target connection + * @r: Reply to link + */ +void kdbus_reply_link(struct kdbus_reply *r) +{ + if (WARN_ON(!list_empty(&r->entry))) + return; + + list_add(&r->entry, &r->reply_dst->reply_list); + kdbus_reply_ref(r); +} + +/** + * kdbus_reply_unlink() - Unlink reply object from target connection + * @r: Reply to unlink + */ +void kdbus_reply_unlink(struct kdbus_reply *r) +{ + if (!list_empty(&r->entry)) { + list_del_init(&r->entry); + kdbus_reply_unref(r); + } +} + +/** + * kdbus_sync_reply_wakeup() - Wake a synchronously blocking reply + * @reply: The reply object + * @err: Error code to set on the remote side + * + * Wake up remote peer (method origin) with the appropriate synchronous reply + * code. + */ +void kdbus_sync_reply_wakeup(struct kdbus_reply *reply, int err) +{ + if (WARN_ON(!reply->sync)) + return; + + reply->waiting = false; + reply->err = err; + wake_up_interruptible(&reply->reply_dst->wait); +} + +/** + * kdbus_reply_find() - Find the corresponding reply object + * @replying: The replying connection or NULL + * @reply_dst: The connection the reply will be sent to + * (method origin) + * @cookie: The cookie of the requesting message + * + * Lookup a reply object that should be sent as a reply by + * @replying to @reply_dst with the given cookie. + * + * Callers must take the @reply_dst lock. + * + * Return: the corresponding reply object or NULL if not found + */ +struct kdbus_reply *kdbus_reply_find(struct kdbus_conn *replying, + struct kdbus_conn *reply_dst, + u64 cookie) +{ + struct kdbus_reply *r; + + list_for_each_entry(r, &reply_dst->reply_list, entry) { + if (r->cookie == cookie && + (!replying || r->reply_src == replying)) + return r; + } + + return NULL; +} + +/** + * kdbus_reply_list_scan_work() - Worker callback to scan the replies of a + * connection for exceeded timeouts + * @work: Work struct of the connection to scan + * + * Walk the list of replies stored with a connection and look for entries + * that have exceeded their timeout. If such an entry is found, a timeout + * notification is sent to the waiting peer, and the reply is removed from + * the list. + * + * The work is rescheduled to the nearest timeout found during the list + * iteration. 
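+ *
+ * A hypothetical caller that just linked a reply with a deadline would
+ * (re)arm this scan on the connection's delayed work item, e.g.:
+ *
+ *	schedule_delayed_work(&conn->work,
+ *			      nsecs_to_jiffies(reply->deadline_ns - now));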
+ */ +void kdbus_reply_list_scan_work(struct work_struct *work) +{ + struct kdbus_conn *conn = + container_of(work, struct kdbus_conn, work.work); + struct kdbus_reply *reply, *reply_tmp; + u64 deadline = ~0ULL; + u64 now; + + now = ktime_get_ns(); + + mutex_lock(&conn->lock); + if (!kdbus_conn_active(conn)) { + mutex_unlock(&conn->lock); + return; + } + + list_for_each_entry_safe(reply, reply_tmp, &conn->reply_list, entry) { + /* + * If the reply block is waiting for synchronous I/O, + * the timeout is handled by wait_event_*_timeout(), + * so we don't have to care for it here. + */ + if (reply->sync && !reply->interrupted) + continue; + + WARN_ON(reply->reply_dst != conn); + + if (reply->deadline_ns > now) { + /* remember next timeout */ + if (deadline > reply->deadline_ns) + deadline = reply->deadline_ns; + + continue; + } + + /* + * A zero deadline means the connection died, was + * cleaned up already and the notification was sent. + * Don't send notifications for reply trackers that were + * left in an interrupted syscall state. + */ + if (reply->deadline_ns != 0 && !reply->interrupted) + kdbus_notify_reply_timeout(conn->ep->bus, conn->id, + reply->cookie); + + kdbus_reply_unlink(reply); + } + + /* rearm delayed work with next timeout */ + if (deadline != ~0ULL) + schedule_delayed_work(&conn->work, + nsecs_to_jiffies(deadline - now)); + + mutex_unlock(&conn->lock); + + kdbus_notify_flush(conn->ep->bus); +} diff -Nur linux-4.2/ipc/kdbus/reply.h linux-4.2-pck/ipc/kdbus/reply.h --- linux-4.2/ipc/kdbus/reply.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/reply.h 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#ifndef __KDBUS_REPLY_H +#define __KDBUS_REPLY_H + +/** + * struct kdbus_reply - an entry of kdbus_conn's list of replies + * @kref: Ref-count of this object + * @entry: The entry of the connection's reply_list + * @reply_src: The connection the reply will be sent from + * @reply_dst: The connection the reply will be sent to + * @queue_entry: The queue entry item that is prepared by the replying + * connection + * @deadline_ns: The deadline of the reply, in nanoseconds + * @cookie: The cookie of the requesting message + * @name_id: ID of the well-known name the original msg was sent to + * @sync: The reply block is waiting for synchronous I/O + * @waiting: The condition to synchronously wait for + * @interrupted: The sync reply was left in an interrupted state + * @err: The error code for the synchronous reply + */ +struct kdbus_reply { + struct kref kref; + struct list_head entry; + struct kdbus_conn *reply_src; + struct kdbus_conn *reply_dst; + struct kdbus_queue_entry *queue_entry; + u64 deadline_ns; + u64 cookie; + u64 name_id; + bool sync:1; + bool waiting:1; + bool interrupted:1; + int err; +}; + +struct kdbus_reply *kdbus_reply_new(struct kdbus_conn *reply_src, + struct kdbus_conn *reply_dst, + const struct kdbus_msg *msg, + struct kdbus_name_entry *name_entry, + bool sync); + +struct kdbus_reply *kdbus_reply_ref(struct kdbus_reply *r); +struct kdbus_reply *kdbus_reply_unref(struct kdbus_reply *r); + +void kdbus_reply_link(struct kdbus_reply *r); +void kdbus_reply_unlink(struct kdbus_reply *r); + +struct kdbus_reply *kdbus_reply_find(struct kdbus_conn *replying, + struct kdbus_conn *reply_dst, + u64 cookie); + +void kdbus_sync_reply_wakeup(struct kdbus_reply *reply, int err); +void kdbus_reply_list_scan_work(struct work_struct *work); + +#endif /* __KDBUS_REPLY_H */ diff -Nur linux-4.2/ipc/kdbus/util.c linux-4.2-pck/ipc/kdbus/util.c --- linux-4.2/ipc/kdbus/util.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/util.c 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "limits.h" +#include "util.h" + +/** + * kdbus_copy_from_user() - copy aligned data from user-space + * @dest: target buffer in kernel memory + * @user_ptr: user-provided source buffer + * @size: memory size to copy from user + * + * This copies @size bytes from @user_ptr into the kernel, just like + * copy_from_user() does. But we enforce an 8-byte alignment and reject any + * unaligned user-space pointers. + * + * Return: 0 on success, negative error code on failure. 
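+ *
+ * A hypothetical ioctl handler copying a fixed-size command struct from
+ * the user pointer @argp (both the handler and @argp are assumptions of
+ * this sketch) would call it as:
+ *
+ *	struct kdbus_cmd cmd;
+ *	int ret;
+ *
+ *	ret = kdbus_copy_from_user(&cmd, argp, sizeof(cmd));
+ *	if (ret < 0)
+ *		return ret;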
+ */ +int kdbus_copy_from_user(void *dest, void __user *user_ptr, size_t size) +{ + if (!KDBUS_IS_ALIGNED8((uintptr_t)user_ptr)) + return -EFAULT; + + if (copy_from_user(dest, user_ptr, size)) + return -EFAULT; + + return 0; +} + +/** + * kdbus_verify_uid_prefix() - verify UID prefix of a user-supplied name + * @name: user-supplied name to verify + * @user_ns: user-namespace to act in + * @kuid: Kernel internal uid of user + * + * This verifies that the user-supplied name @name has their UID as prefix. This + * is the default name-spacing policy we enforce on user-supplied names for + * public kdbus entities like buses and endpoints. + * + * The user must supply names prefixed with "-", whereas the UID is + * interpreted in the user-namespace of the domain. If the user fails to supply + * such a prefixed name, we reject it. + * + * Return: 0 on success, negative error code on failure + */ +int kdbus_verify_uid_prefix(const char *name, struct user_namespace *user_ns, + kuid_t kuid) +{ + uid_t uid; + char prefix[16]; + + /* + * The kuid must have a mapping into the userns of the domain + * otherwise do not allow creation of buses nor endpoints. + */ + uid = from_kuid(user_ns, kuid); + if (uid == (uid_t) -1) + return -EINVAL; + + snprintf(prefix, sizeof(prefix), "%u-", uid); + if (strncmp(name, prefix, strlen(prefix)) != 0) + return -EINVAL; + + return 0; +} + +/** + * kdbus_sanitize_attach_flags() - Sanitize attach flags from user-space + * @flags: Attach flags provided by userspace + * @attach_flags: A pointer where to store the valid attach flags + * + * Convert attach-flags provided by user-space into a valid mask. If the mask + * is invalid, an error is returned. The sanitized attach flags are stored in + * the output parameter. + * + * Return: 0 on success, negative error on failure. + */ +int kdbus_sanitize_attach_flags(u64 flags, u64 *attach_flags) +{ + /* 'any' degrades to 'all' for compatibility */ + if (flags == _KDBUS_ATTACH_ANY) + flags = _KDBUS_ATTACH_ALL; + + /* reject unknown attach flags */ + if (flags & ~_KDBUS_ATTACH_ALL) + return -EINVAL; + + *attach_flags = flags; + return 0; +} + +/** + * kdbus_kvec_set - helper utility to assemble kvec arrays + * @kvec: kvec entry to use + * @src: Source address to set in @kvec + * @len: Number of bytes in @src + * @total_len: Pointer to total length variable + * + * Set @src and @len in @kvec, and increase @total_len by @len. + */ +void kdbus_kvec_set(struct kvec *kvec, void *src, size_t len, u64 *total_len) +{ + kvec->iov_base = src; + kvec->iov_len = len; + *total_len += len; +} + +static const char * const zeros = "\0\0\0\0\0\0\0"; + +/** + * kdbus_kvec_pad - conditionally write a padding kvec + * @kvec: kvec entry to use + * @len: Total length used for kvec array + * + * Check if the current total byte length of the array in @len is aligned to + * 8 bytes. If it isn't, fill @kvec with padding information and increase @len + * by the number of bytes stored in @kvec. + * + * Return: the number of added padding bytes. 
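+ *
+ * Together with kdbus_kvec_set() this helps assemble 8-byte aligned item
+ * streams; a hypothetical caller appending one item (@item assumed) and
+ * padding up to the next alignment boundary could do:
+ *
+ *	u64 total = 0;
+ *	struct kvec kvec[2];
+ *	size_t nr;
+ *
+ *	kdbus_kvec_set(&kvec[0], item, item->size, &total);
+ *	nr = 1 + !!kdbus_kvec_pad(&kvec[1], &total);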
+ */ +size_t kdbus_kvec_pad(struct kvec *kvec, u64 *len) +{ + size_t pad = KDBUS_ALIGN8(*len) - *len; + + if (!pad) + return 0; + + kvec->iov_base = (void *)zeros; + kvec->iov_len = pad; + + *len += pad; + + return pad; +} diff -Nur linux-4.2/ipc/kdbus/util.h linux-4.2-pck/ipc/kdbus/util.h --- linux-4.2/ipc/kdbus/util.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/ipc/kdbus/util.h 2015-09-08 00:48:22.238363406 -0300 @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2013-2015 Greg Kroah-Hartman + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 David Herrmann + * Copyright (C) 2013-2015 Linux Foundation + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#ifndef __KDBUS_UTIL_H +#define __KDBUS_UTIL_H + +#include +#include + +#include + +/* all exported addresses are 64 bit */ +#define KDBUS_PTR(addr) ((void __user *)(uintptr_t)(addr)) + +/* all exported sizes are 64 bit and data aligned to 64 bit */ +#define KDBUS_ALIGN8(s) ALIGN((s), 8) +#define KDBUS_IS_ALIGNED8(s) (IS_ALIGNED(s, 8)) + +/** + * kdbus_member_set_user - write a structure member to user memory + * @_s: Variable to copy from + * @_b: Buffer to write to + * @_t: Structure type + * @_m: Member name in the passed structure + * + * Return: the result of copy_to_user() + */ +#define kdbus_member_set_user(_s, _b, _t, _m) \ +({ \ + u64 __user *_sz = \ + (void __user *)((u8 __user *)(_b) + offsetof(_t, _m)); \ + copy_to_user(_sz, _s, FIELD_SIZEOF(_t, _m)); \ +}) + +/** + * kdbus_strhash - calculate a hash + * @str: String + * + * Return: hash value + */ +static inline unsigned int kdbus_strhash(const char *str) +{ + unsigned long hash = init_name_hash(); + + while (*str) + hash = partial_name_hash(*str++, hash); + + return end_name_hash(hash); +} + +int kdbus_verify_uid_prefix(const char *name, struct user_namespace *user_ns, + kuid_t kuid); +int kdbus_sanitize_attach_flags(u64 flags, u64 *attach_flags); + +int kdbus_copy_from_user(void *dest, void __user *user_ptr, size_t size); + +struct kvec; + +void kdbus_kvec_set(struct kvec *kvec, void *src, size_t len, u64 *total_len); +size_t kdbus_kvec_pad(struct kvec *kvec, u64 *len); + +#endif diff -Nur linux-4.2/ipc/shm.c linux-4.2-pck/ipc/shm.c --- linux-4.2/ipc/shm.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/ipc/shm.c 2015-09-08 11:20:32.423680758 -0300 @@ -545,7 +545,7 @@ if ((shmflg & SHM_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) acctflag = VM_NORESERVE; - file = shmem_kernel_file_setup(name, size, acctflag); + file = shmem_kernel_file_setup(name, size, acctflag, 0); } error = PTR_ERR(file); if (IS_ERR(file)) diff -Nur linux-4.2/kernel/debug/kdb/kdb_io.c linux-4.2-pck/kernel/debug/kdb/kdb_io.c --- linux-4.2/kernel/debug/kdb/kdb_io.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/debug/kdb/kdb_io.c 2015-09-08 00:48:22.238363406 -0300 @@ -710,7 +710,7 @@ } } while (c) { - c->write(c, cp, retlen - (cp - kdb_buffer)); + c->write(c, cp, retlen - (cp - kdb_buffer), 7); /* 7 == KERN_DEBUG */ touch_nmi_watchdog(); c = c->next; } @@ -774,7 +774,7 @@ } } while (c) { - c->write(c, moreprompt, strlen(moreprompt)); + c->write(c, moreprompt, strlen(moreprompt), 7); /* 7 == KERN_DEBUG */ touch_nmi_watchdog(); c = c->next; } diff -Nur 
linux-4.2/kernel/fork.c linux-4.2-pck/kernel/fork.c --- linux-4.2/kernel/fork.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/fork.c 2015-09-08 11:20:32.430347100 -0300 @@ -138,7 +138,7 @@ static inline struct task_struct *alloc_task_struct_node(int node) { - return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); + return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL | ___GFP_TOI_NOTRACK, node); } static inline void free_task_struct(struct task_struct *tsk) @@ -443,7 +443,7 @@ goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; @@ -494,7 +494,7 @@ __vma_link_rb(mm, tmp, rb_link, rb_parent); rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; - + uksm_vma_add_new(tmp); mm->map_count++; retval = copy_page_range(mm, oldmm, mpnt); diff -Nur linux-4.2/kernel/kthread.c linux-4.2-pck/kernel/kthread.c --- linux-4.2/kernel/kthread.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/kthread.c 2015-09-08 11:20:32.440346612 -0300 @@ -274,7 +274,7 @@ DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), - GFP_KERNEL); + GFP_KERNEL | ___GFP_TOI_NOTRACK); if (!create) return ERR_PTR(-ENOMEM); diff -Nur linux-4.2/kernel/power/Kconfig linux-4.2-pck/kernel/power/Kconfig --- linux-4.2/kernel/power/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/power/Kconfig 2015-09-08 11:20:32.450346125 -0300 @@ -91,6 +91,284 @@ suspended image to. It will simply pick the first available swap device. +menuconfig TOI_CORE + bool "Enhanced Hibernation (TuxOnIce)" + depends on HIBERNATION + default y + ---help--- + TuxOnIce is the 'new and improved' suspend support. + + See the TuxOnIce home page (tuxonice.net) + for FAQs, HOWTOs and other documentation. + + comment "Image Storage (you need at least one allocator)" + depends on TOI_CORE + + config TOI_FILE + bool "File Allocator" + depends on TOI_CORE + default y + ---help--- + This option enables support for storing an image in a + simple file. You might want this if your swap is + sometimes full enough that you don't have enough spare + space to store an image. + + config TOI_SWAP + bool "Swap Allocator" + depends on TOI_CORE && SWAP + default y + ---help--- + This option enables support for storing an image in your + swap space. + + comment "General Options" + depends on TOI_CORE + + config TOI_PRUNE + bool "Image pruning support" + depends on TOI_CORE && CRYPTO && BROKEN + default y + ---help--- + This option adds support for using cryptoapi hashing + algorithms to identify pages with the same content. We + then write a much smaller pointer to the first copy of + the data instead of a complete (perhaps compressed) + additional copy. + + You probably want this, so say Y here. + + comment "No image pruning support available without Cryptoapi support." + depends on TOI_CORE && !CRYPTO + + config TOI_CRYPTO + bool "Compression support" + depends on TOI_CORE && CRYPTO + default y + ---help--- + This option adds support for using cryptoapi compression + algorithms. Compression is particularly useful as it can + more than double your suspend and resume speed (depending + upon how well your image compresses). + + You probably want this, so say Y here. + + comment "No compression support available without Cryptoapi support." 
+ depends on TOI_CORE && !CRYPTO + + config TOI_USERUI + bool "Userspace User Interface support" + depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE) + default y + ---help--- + This option enabled support for a userspace based user interface + to TuxOnIce, which allows you to have a nice display while suspending + and resuming, and also enables features such as pressing escape to + cancel a cycle or interactive debugging. + + config TOI_USERUI_DEFAULT_PATH + string "Default userui program location" + default "/usr/local/sbin/tuxoniceui_text" + depends on TOI_USERUI + ---help--- + This entry allows you to specify a default path to the userui binary. + + config TOI_DEFAULT_IMAGE_SIZE_LIMIT + int "Default image size limit" + range -2 65536 + default "-2" + depends on TOI_CORE + ---help--- + This entry allows you to specify a default image size limit. It can + be overridden at run-time using /sys/power/tuxonice/image_size_limit. + + config TOI_KEEP_IMAGE + bool "Allow Keep Image Mode" + depends on TOI_CORE + ---help--- + This option allows you to keep and image and reuse it. It is intended + __ONLY__ for use with systems where all filesystems are mounted read- + only (kiosks, for example). To use it, compile this option in and boot + normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend. + When you resume, the image will not be removed. You will be unable to turn + off swap partitions (assuming you are using the swap allocator), but future + suspends simply do a power-down. The image can be updated using the + kernel command line parameter suspend_act= to turn off the keep image + bit. Keep image mode is a little less user friendly on purpose - it + should not be used without thought! + + config TOI_INCREMENTAL + bool "Incremental Image Support" + depends on TOI_CORE && 64BIT && TOI_KEEP_IMAGE + default n + ---help--- + This option enables the work in progress toward using the dirty page + tracking to record changes to pages. It is hoped that + this will be an initial step toward implementing storing just + the differences between consecutive images, which will + increase the amount of storage needed for the image, but also + increase the speed at which writing an image occurs and + reduce the wear and tear on drives. + + At the moment, all that is implemented is the first step of keeping + an existing image and then comparing it to the contents in memory + (by setting /sys/power/tuxonice/verify_image to 1 and triggering a + (fake) resume) to see what the page change tracking should find to be + different. If you have verify_image set to 1, TuxOnIce will automatically + invalidate the old image when you next try to hibernate, so there's no + greater chance of disk corruption than normal. + + comment "No incremental image support available without Keep Image support." + depends on TOI_CORE && !TOI_KEEP_IMAGE && 64BIT + + config TOI_REPLACE_SWSUSP + bool "Replace swsusp by default" + default y + depends on TOI_CORE + ---help--- + TuxOnIce can replace swsusp. This option makes that the default state, + requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want + to use the vanilla kernel functionality. Note that your initrd/ramfs will + need to do this before trying to resume, too. + With overriding swsusp enabled, echoing disk to /sys/power/state will + start a TuxOnIce cycle. If resume= doesn't specify an allocator and both + the swap and file allocators are compiled in, the swap allocator will be + used by default. 
+ + config TOI_IGNORE_LATE_INITCALL + bool "Wait for initrd/ramfs to run, by default" + default n + depends on TOI_CORE + ---help--- + When booting, TuxOnIce can check for an image and start to resume prior + to any initrd/ramfs running (via a late initcall). + + If you don't have an initrd/ramfs, this is what you want to happen - + otherwise you won't be able to safely resume. You should set this option + to 'No'. + + If, however, you want your initrd/ramfs to run anyway before resuming, + you need to tell TuxOnIce to ignore that earlier opportunity to resume. + This can be done either by using this compile time option, or by + overriding this option with the boot-time parameter toi_initramfs_resume_only=1. + + Note that if TuxOnIce can't resume at the earlier opportunity, the + value of this option won't matter - the initramfs/initrd (if any) will + run anyway. + + menuconfig TOI_CLUSTER + bool "Cluster support" + default n + depends on TOI_CORE && NET && BROKEN + ---help--- + Support for linking multiple machines in a cluster so that they suspend + and resume together. + + config TOI_DEFAULT_CLUSTER_INTERFACE + string "Default cluster interface" + depends on TOI_CLUSTER + ---help--- + The default interface on which to communicate with other nodes in + the cluster. + + If no value is set here, cluster support will be disabled by default. + + config TOI_DEFAULT_CLUSTER_KEY + string "Default cluster key" + default "Default" + depends on TOI_CLUSTER + ---help--- + The default key used by this node. All nodes in the same cluster + have the same key. Multiple clusters may coexist on the same lan + by using different values for this key. + + config TOI_CLUSTER_IMAGE_TIMEOUT + int "Timeout when checking for image" + default 15 + depends on TOI_CLUSTER + ---help--- + Timeout (seconds) before continuing to boot when waiting to see + whether other nodes might have an image. Set to -1 to wait + indefinitely. In WAIT_UNTIL_NODES is non zero, we might continue + booting sooner than this timeout. + + config TOI_CLUSTER_WAIT_UNTIL_NODES + int "Nodes without image before continuing" + default 0 + depends on TOI_CLUSTER + ---help--- + When booting and no image is found, we wait to see if other nodes + have an image before continuing to boot. This value lets us + continue after seeing a certain number of nodes without an image, + instead of continuing to wait for the timeout. Set to 0 to only + use the timeout. + + config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE + string "Default pre-hibernate script" + depends on TOI_CLUSTER + ---help--- + The default script to be called when starting to hibernate. + + config TOI_DEFAULT_CLUSTER_POST_HIBERNATE + string "Default post-hibernate script" + depends on TOI_CLUSTER + ---help--- + The default script to be called after resuming from hibernation. + + config TOI_DEFAULT_WAIT + int "Default waiting time for emergency boot messages" + default "25" + range -1 32768 + depends on TOI_CORE + help + TuxOnIce can display warnings very early in the process of resuming, + if (for example) it appears that you have booted a kernel that doesn't + match an image on disk. It can then give you the opportunity to either + continue booting that kernel, or reboot the machine. This option can be + used to control how long to wait in such circumstances. -1 means wait + forever. 0 means don't wait at all (do the default action, which will + generally be to continue booting and remove the image). Values of 1 or + more indicate a number of seconds (up to 255) to wait before doing the + default. 
+ + config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE + int "Default extra pages allowance" + default "2000" + range 500 32768 + depends on TOI_CORE + help + This value controls the default for the allowance TuxOnIce makes for + drivers to allocate extra memory during the atomic copy. The default + value of 2000 will be okay in most cases. If you are using + DRI, the easiest way to find what value to use is to try to hibernate + and look at how many pages were actually needed in the sysfs entry + /sys/power/tuxonice/debug_info (first number on the last line), adding + a little extra because the value is not always the same. + + config TOI_CHECKSUM + bool "Checksum pageset2" + default n + depends on TOI_CORE + select CRYPTO + select CRYPTO_ALGAPI + select CRYPTO_MD4 + ---help--- + Adds support for checksumming pageset2 pages, to ensure you really get an + atomic copy. Since some filesystems (XFS especially) change metadata even + when there's no other activity, we need this to check for pages that have + been changed while we were saving the page cache. If your debugging output + always says no pages were resaved, you may be able to safely disable this + option. + +config TOI + bool + depends on TOI_CORE!=n + default y + +config TOI_ZRAM_SUPPORT + def_bool y + depends on TOI && ZRAM!=n + config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS diff -Nur linux-4.2/kernel/power/Makefile linux-4.2-pck/kernel/power/Makefile --- linux-4.2/kernel/power/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/power/Makefile 2015-09-08 11:20:32.460345634 -0300 @@ -1,6 +1,38 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG +tuxonice_core-y := tuxonice_modules.o + +obj-$(CONFIG_TOI) += tuxonice_builtin.o +obj-$(CONFIG_TOI_INCREMENTAL) += tuxonice_incremental.o \ + tuxonice_copy_before_write.o + +tuxonice_core-$(CONFIG_PM_DEBUG) += tuxonice_alloc.o + +# Compile these in after allocation debugging, if used. + +tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \ + tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \ + tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \ + tuxonice_power_off.o tuxonice_atomic_copy.o + +tuxonice_core-$(CONFIG_TOI_CHECKSUM) += tuxonice_checksum.o + +tuxonice_core-$(CONFIG_NET) += tuxonice_storage.o tuxonice_netlink.o + +obj-$(CONFIG_TOI_CORE) += tuxonice_core.o +obj-$(CONFIG_TOI_PRUNE) += tuxonice_prune.o +obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o + +tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \ + tuxonice_bio_signature.o + +obj-$(CONFIG_TOI_SWAP) += tuxonice_bio.o tuxonice_swap.o +obj-$(CONFIG_TOI_FILE) += tuxonice_bio.o tuxonice_file.o +obj-$(CONFIG_TOI_CLUSTER) += tuxonice_cluster.o + +obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o + obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o diff -Nur linux-4.2/kernel/power/hibernate.c linux-4.2-pck/kernel/power/hibernate.c --- linux-4.2/kernel/power/hibernate.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/power/hibernate.c 2015-09-08 11:20:32.460345634 -0300 @@ -31,7 +31,7 @@ #include #include -#include "power.h" +#include "tuxonice.h" static int nocompress; @@ -39,7 +39,7 @@ static int nohibernate; static int resume_wait; static unsigned int resume_delay; -static char resume_file[256] = CONFIG_PM_STD_PARTITION; +char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; __visible int in_suspend __nosavedata; @@ -123,7 +123,7 @@ * platform_begin - Call platform to start hibernation. 
* @platform_mode: Whether or not to use the platform driver. */ -static int platform_begin(int platform_mode) +int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->begin() : 0; @@ -133,7 +133,7 @@ * platform_end - Call platform to finish transition to the working state. * @platform_mode: Whether or not to use the platform driver. */ -static void platform_end(int platform_mode) +void platform_end(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->end(); @@ -147,7 +147,7 @@ * if so configured, and return an error code if that fails. */ -static int platform_pre_snapshot(int platform_mode) +int platform_pre_snapshot(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->pre_snapshot() : 0; @@ -162,7 +162,7 @@ * * This routine is called on one CPU with interrupts disabled. */ -static void platform_leave(int platform_mode) +void platform_leave(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->leave(); @@ -177,7 +177,7 @@ * * This routine must be called after platform_prepare(). */ -static void platform_finish(int platform_mode) +void platform_finish(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->finish(); @@ -193,7 +193,7 @@ * If the restore fails after this function has been called, * platform_restore_cleanup() must be called. */ -static int platform_pre_restore(int platform_mode) +int platform_pre_restore(int platform_mode) { return (platform_mode && hibernation_ops) ? hibernation_ops->pre_restore() : 0; @@ -210,7 +210,7 @@ * function must be called too, regardless of the result of * platform_pre_restore(). */ -static void platform_restore_cleanup(int platform_mode) +void platform_restore_cleanup(int platform_mode) { if (platform_mode && hibernation_ops) hibernation_ops->restore_cleanup(); @@ -220,7 +220,7 @@ * platform_recover - Recover from a failure to suspend devices. * @platform_mode: Whether or not to use the platform driver. */ -static void platform_recover(int platform_mode) +void platform_recover(int platform_mode) { if (platform_mode && hibernation_ops && hibernation_ops->recover) hibernation_ops->recover(); @@ -648,6 +648,9 @@ { int error; + if (test_action_state(TOI_REPLACE_SWSUSP)) + return try_tuxonice_hibernate(); + if (!hibernation_available()) { pr_debug("PM: Hibernation not available.\n"); return -EPERM; @@ -737,11 +740,19 @@ * attempts to recover gracefully and make the kernel return to the normal mode * of operation. */ -static int software_resume(void) +int software_resume(void) { int error; unsigned int flags; + resume_attempted = 1; + + /* + * We can't know (until an image header - if any - is loaded), whether + * we did override swsusp. We therefore ensure that both are tried. + */ + try_tuxonice_resume(); + /* * If the user said "noresume".. bail out early. */ @@ -1128,6 +1139,7 @@ static int __init noresume_setup(char *str) { noresume = 1; + set_toi_state(TOI_NORESUME_SPECIFIED); return 1; } diff -Nur linux-4.2/kernel/power/power.h linux-4.2-pck/kernel/power/power.h --- linux-4.2/kernel/power/power.h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/power/power.h 2015-09-08 11:20:32.460345634 -0300 @@ -36,8 +36,12 @@ return arch_hibernation_header_restore(info) ? 
"architecture specific data" : NULL; } +#else +extern char *check_image_kernel(struct swsusp_info *info); #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ +extern int init_header(struct swsusp_info *info); +extern char resume_file[256]; /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] @@ -77,6 +81,8 @@ .store = _name##_store, \ } +extern struct pbe *restore_pblist; + /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ @@ -260,6 +266,31 @@ } #endif +extern struct page *saveable_page(struct zone *z, unsigned long p); +#ifdef CONFIG_HIGHMEM +struct page *saveable_highmem_page(struct zone *z, unsigned long p); +#else +static +inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} +#endif + +#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe)) +extern struct list_head nosave_regions; + +/** + * This structure represents a range of page frames the contents of which + * should not be saved during the suspend. + */ + +struct nosave_region { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ @@ -286,3 +317,10 @@ extern int pm_wake_unlock(const char *buf); #endif /* !CONFIG_PM_WAKELOCKS */ + +#ifdef CONFIG_TOI +unsigned long toi_get_nonconflicting_page(void); +#define BM_END_OF_MAP (~0UL) +#else +#define toi_get_nonconflicting_page() (0) +#endif diff -Nur linux-4.2/kernel/power/snapshot.c linux-4.2-pck/kernel/power/snapshot.c --- linux-4.2/kernel/power/snapshot.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/power/snapshot.c 2015-09-08 11:20:32.460345634 -0300 @@ -36,6 +36,9 @@ #include #include +#include "tuxonice_modules.h" +#include "tuxonice_builtin.h" +#include "tuxonice_alloc.h" #include "power.h" static int swsusp_page_is_free(struct page *); @@ -98,6 +101,9 @@ { void *res; + if (toi_running) + return (void *) toi_get_nonconflicting_page(); + res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) while (res && swsusp_page_is_free(virt_to_page(res))) { @@ -143,6 +149,11 @@ page = virt_to_page(addr); + if (toi_running) { + toi__free_page(29, page); + return; + } + swsusp_unset_page_forbidden(page); if (clear_nosave_free) swsusp_unset_page_free(page); @@ -302,13 +313,15 @@ int node_bit; }; +#define BM_POSITION_SLOTS (NR_CPUS * 2) + struct memory_bitmap { struct list_head zones; struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects */ - struct bm_position cur; /* most recently used bit position */ + struct bm_position cur[BM_POSITION_SLOTS]; /* most recently used bit position */ }; /* Functions that operate on memory bitmaps */ @@ -473,16 +486,39 @@ free_image_page(node->data, clear_nosave_free); } -static void memory_bm_position_reset(struct memory_bitmap *bm) +void memory_bm_position_reset(struct memory_bitmap *bm) { - bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + int index; + + for (index = 0; index < BM_POSITION_SLOTS; index++) { + bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, list); - bm->cur.node = list_entry(bm->cur.zone->leaves.next, + bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, struct rtree_node, list); - bm->cur.node_pfn = 0; - bm->cur.node_bit = 0; + bm->cur[index].node_pfn = 0; + bm->cur[index].node_bit = 0; + } } +static void 
memory_bm_clear_current(struct memory_bitmap *bm, int index); +unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); + +/** + * memory_bm_clear + * @param bm - The bitmap to clear + * + * Only run while single threaded - locking not needed + */ +void memory_bm_clear(struct memory_bitmap *bm) +{ + memory_bm_position_reset(bm); + + while (memory_bm_next_pfn(bm, 0) != BM_END_OF_MAP) { + memory_bm_clear_current(bm, 0); + } + + memory_bm_position_reset(bm); +} static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); struct mem_extent { @@ -595,7 +631,8 @@ } bm->p_list = ca.chain; - memory_bm_position_reset(bm); + + memory_bm_position_reset(bm); Exit: free_mem_extents(&mem_extents); return error; @@ -631,14 +668,24 @@ * It walks the radix tree to find the page which contains the bit for * pfn and returns the bit position in **addr and *bit_nr. */ -static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, - void **addr, unsigned int *bit_nr) +int memory_bm_find_bit(struct memory_bitmap *bm, int index, + unsigned long pfn, void **addr, unsigned int *bit_nr) { struct mem_zone_bm_rtree *curr, *zone; struct rtree_node *node; int i, block_nr; - zone = bm->cur.zone; + if (!bm->cur[index].zone) { + // Reset + bm->cur[index].zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, + list); + bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, + struct rtree_node, list); + bm->cur[index].node_pfn = 0; + bm->cur[index].node_bit = 0; + } + + zone = bm->cur[index].zone; if (pfn >= zone->start_pfn && pfn < zone->end_pfn) goto zone_found; @@ -662,8 +709,8 @@ * node for our pfn. */ - node = bm->cur.node; - if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + node = bm->cur[index].node; + if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur[index].node_pfn) goto node_found; node = zone->rtree; @@ -680,9 +727,9 @@ node_found: /* Update last position */ - bm->cur.zone = zone; - bm->cur.node = node; - bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + bm->cur[index].zone = zone; + bm->cur[index].node = node; + bm->cur[index].node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; /* Set return values */ *addr = node->data; @@ -691,66 +738,66 @@ return 0; } -static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); BUG_ON(error); set_bit(bit, addr); } -static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) +int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); if (!error) set_bit(bit, addr); return error; } -static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) +void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); BUG_ON(error); clear_bit(bit, addr); } -static void memory_bm_clear_current(struct memory_bitmap *bm) +static void memory_bm_clear_current(struct memory_bitmap *bm, int index) { int bit; - bit = max(bm->cur.node_bit - 1, 0); - clear_bit(bit, 
bm->cur.node->data); + bit = max(bm->cur[index].node_bit - 1, 0); + clear_bit(bit, bm->cur[index].node->data); } -static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) +int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn) { void *addr; unsigned int bit; int error; - error = memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, index, pfn, &addr, &bit); BUG_ON(error); return test_bit(bit, addr); } -static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) +static bool memory_bm_pfn_present(struct memory_bitmap *bm, int index, unsigned long pfn) { void *addr; unsigned int bit; - return !memory_bm_find_bit(bm, pfn, &addr, &bit); + return !memory_bm_find_bit(bm, index, pfn, &addr, &bit); } /* @@ -763,25 +810,25 @@ * * Returns true if there is a next node, false otherwise. */ -static bool rtree_next_node(struct memory_bitmap *bm) +static bool rtree_next_node(struct memory_bitmap *bm, int index) { - bm->cur.node = list_entry(bm->cur.node->list.next, + bm->cur[index].node = list_entry(bm->cur[index].node->list.next, struct rtree_node, list); - if (&bm->cur.node->list != &bm->cur.zone->leaves) { - bm->cur.node_pfn += BM_BITS_PER_BLOCK; - bm->cur.node_bit = 0; + if (&bm->cur[index].node->list != &bm->cur[index].zone->leaves) { + bm->cur[index].node_pfn += BM_BITS_PER_BLOCK; + bm->cur[index].node_bit = 0; touch_softlockup_watchdog(); return true; } /* No more nodes, goto next zone */ - bm->cur.zone = list_entry(bm->cur.zone->list.next, + bm->cur[index].zone = list_entry(bm->cur[index].zone->list.next, struct mem_zone_bm_rtree, list); - if (&bm->cur.zone->list != &bm->zones) { - bm->cur.node = list_entry(bm->cur.zone->leaves.next, + if (&bm->cur[index].zone->list != &bm->zones) { + bm->cur[index].node = list_entry(bm->cur[index].zone->leaves.next, struct rtree_node, list); - bm->cur.node_pfn = 0; - bm->cur.node_bit = 0; + bm->cur[index].node_pfn = 0; + bm->cur[index].node_bit = 0; return true; } @@ -799,38 +846,29 @@ * It is required to run memory_bm_position_reset() before the * first call to this function. */ -static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) +unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index) { unsigned long bits, pfn, pages; int bit; + index += NR_CPUS; /* Iteration state is separated from get/set/test */ + do { - pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; - bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); - bit = find_next_bit(bm->cur.node->data, bits, - bm->cur.node_bit); + pages = bm->cur[index].zone->end_pfn - bm->cur[index].zone->start_pfn; + bits = min(pages - bm->cur[index].node_pfn, BM_BITS_PER_BLOCK); + bit = find_next_bit(bm->cur[index].node->data, bits, + bm->cur[index].node_bit); if (bit < bits) { - pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; - bm->cur.node_bit = bit + 1; + pfn = bm->cur[index].zone->start_pfn + bm->cur[index].node_pfn + bit; + bm->cur[index].node_bit = bit + 1; return pfn; } - } while (rtree_next_node(bm)); + } while (rtree_next_node(bm, index)); return BM_END_OF_MAP; } -/** - * This structure represents a range of page frames the contents of which - * should not be saved during the suspend. 
- */ - -struct nosave_region { - struct list_head list; - unsigned long start_pfn; - unsigned long end_pfn; -}; - -static LIST_HEAD(nosave_regions); +LIST_HEAD(nosave_regions); /** * register_nosave_region - register a range of page frames the contents @@ -889,37 +927,37 @@ void swsusp_set_page_free(struct page *page) { if (free_pages_map) - memory_bm_set_bit(free_pages_map, page_to_pfn(page)); + memory_bm_set_bit(free_pages_map, 0, page_to_pfn(page)); } static int swsusp_page_is_free(struct page *page) { return free_pages_map ? - memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0; + memory_bm_test_bit(free_pages_map, 0, page_to_pfn(page)) : 0; } void swsusp_unset_page_free(struct page *page) { if (free_pages_map) - memory_bm_clear_bit(free_pages_map, page_to_pfn(page)); + memory_bm_clear_bit(free_pages_map, 0, page_to_pfn(page)); } static void swsusp_set_page_forbidden(struct page *page) { if (forbidden_pages_map) - memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page)); + memory_bm_set_bit(forbidden_pages_map, 0, page_to_pfn(page)); } int swsusp_page_is_forbidden(struct page *page) { return forbidden_pages_map ? - memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0; + memory_bm_test_bit(forbidden_pages_map, 0, page_to_pfn(page)) : 0; } static void swsusp_unset_page_forbidden(struct page *page) { if (forbidden_pages_map) - memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page)); + memory_bm_clear_bit(forbidden_pages_map, 0, page_to_pfn(page)); } /** @@ -950,7 +988,7 @@ * touch the PFNs for which the error is * returned anyway. */ - mem_bm_set_bit_check(bm, pfn); + mem_bm_set_bit_check(bm, 0, pfn); } } } @@ -1078,7 +1116,7 @@ * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ -static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) +struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -1125,11 +1163,6 @@ } return n; } -#else -static inline void *saveable_highmem_page(struct zone *z, unsigned long p) -{ - return NULL; -} #endif /* CONFIG_HIGHMEM */ /** @@ -1140,7 +1173,7 @@ * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. */ -static struct page *saveable_page(struct zone *zone, unsigned long pfn) +struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -1278,15 +1311,15 @@ max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) - memory_bm_set_bit(orig_bm, pfn); + memory_bm_set_bit(orig_bm, 0, pfn); } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); for(;;) { - pfn = memory_bm_next_pfn(orig_bm); + pfn = memory_bm_next_pfn(orig_bm, 0); if (unlikely(pfn == BM_END_OF_MAP)) break; - copy_data_page(memory_bm_next_pfn(copy_bm), pfn); + copy_data_page(memory_bm_next_pfn(copy_bm, 0), pfn); } } @@ -1332,8 +1365,8 @@ memory_bm_position_reset(free_pages_map); loop: - fr_pfn = memory_bm_next_pfn(free_pages_map); - fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + fr_pfn = memory_bm_next_pfn(free_pages_map, 0); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0); /* * Find the next bit set in both bitmaps. 
This is guaranteed to @@ -1341,16 +1374,16 @@ */ do { if (fb_pfn < fr_pfn) - fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map, 0); if (fr_pfn < fb_pfn) - fr_pfn = memory_bm_next_pfn(free_pages_map); + fr_pfn = memory_bm_next_pfn(free_pages_map, 0); } while (fb_pfn != fr_pfn); if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { struct page *page = pfn_to_page(fr_pfn); - memory_bm_clear_current(forbidden_pages_map); - memory_bm_clear_current(free_pages_map); + memory_bm_clear_current(forbidden_pages_map, 0); + memory_bm_clear_current(free_pages_map, 0); __free_page(page); goto loop; } @@ -1385,7 +1418,7 @@ page = alloc_image_page(mask); if (!page) break; - memory_bm_set_bit(©_bm, page_to_pfn(page)); + memory_bm_set_bit(©_bm, 0, page_to_pfn(page)); if (PageHighMem(page)) alloc_highmem++; else @@ -1481,7 +1514,7 @@ memory_bm_position_reset(©_bm); while (to_free_normal > 0 || to_free_highmem > 0) { - unsigned long pfn = memory_bm_next_pfn(©_bm); + unsigned long pfn = memory_bm_next_pfn(©_bm, 0); struct page *page = pfn_to_page(pfn); if (PageHighMem(page)) { @@ -1495,7 +1528,7 @@ to_free_normal--; alloc_normal--; } - memory_bm_clear_bit(©_bm, pfn); + memory_bm_clear_bit(©_bm, 0, pfn); swsusp_unset_page_forbidden(page); swsusp_unset_page_free(page); __free_page(page); @@ -1780,7 +1813,7 @@ struct page *page; page = alloc_image_page(__GFP_HIGHMEM); - memory_bm_set_bit(bm, page_to_pfn(page)); + memory_bm_set_bit(bm, 0, page_to_pfn(page)); } return nr_highmem; } @@ -1823,7 +1856,7 @@ page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); if (!page) goto err_out; - memory_bm_set_bit(copy_bm, page_to_pfn(page)); + memory_bm_set_bit(copy_bm, 0, page_to_pfn(page)); } } @@ -1838,6 +1871,9 @@ { unsigned int nr_pages, nr_highmem; + if (toi_running) + return toi_post_context_save(); + printk(KERN_INFO "PM: Creating hibernation image:\n"); drain_local_pages(NULL); @@ -1885,7 +1921,7 @@ return 0; } -static char *check_image_kernel(struct swsusp_info *info) +char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; @@ -1906,7 +1942,7 @@ return nr_copy_pages + nr_meta_pages + 1; } -static int init_header(struct swsusp_info *info) +int init_header(struct swsusp_info *info) { memset(info, 0, sizeof(struct swsusp_info)); info->num_physpages = get_num_physpages(); @@ -1928,7 +1964,7 @@ int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { - buf[j] = memory_bm_next_pfn(bm); + buf[j] = memory_bm_next_pfn(bm, 0); if (unlikely(buf[j] == BM_END_OF_MAP)) break; /* Save page key for data page (s390 only). 
*/ @@ -1979,7 +2015,7 @@ } else { struct page *page; - page = pfn_to_page(memory_bm_next_pfn(©_bm)); + page = pfn_to_page(memory_bm_next_pfn(©_bm, 0)); if (PageHighMem(page)) { /* Highmem pages are copied to the buffer, * because we can't return with a kmapped @@ -2021,7 +2057,7 @@ /* Mark pages that correspond to the "original" pfns as "unsafe" */ memory_bm_position_reset(bm); do { - pfn = memory_bm_next_pfn(bm); + pfn = memory_bm_next_pfn(bm, 0); if (likely(pfn != BM_END_OF_MAP)) { if (likely(pfn_valid(pfn))) swsusp_set_page_free(pfn_to_page(pfn)); @@ -2041,10 +2077,10 @@ unsigned long pfn; memory_bm_position_reset(src); - pfn = memory_bm_next_pfn(src); + pfn = memory_bm_next_pfn(src, 0); while (pfn != BM_END_OF_MAP) { - memory_bm_set_bit(dst, pfn); - pfn = memory_bm_next_pfn(src); + memory_bm_set_bit(dst, 0, pfn); + pfn = memory_bm_next_pfn(src, 0); } } @@ -2095,8 +2131,8 @@ /* Extract and buffer page key for data page (s390 only). */ page_key_memorize(buf + j); - if (memory_bm_pfn_present(bm, buf[j])) - memory_bm_set_bit(bm, buf[j]); + if (memory_bm_pfn_present(bm, 0, buf[j])) + memory_bm_set_bit(bm, 0, buf[j]); else return -EFAULT; } @@ -2139,12 +2175,12 @@ unsigned int cnt = 0; memory_bm_position_reset(bm); - pfn = memory_bm_next_pfn(bm); + pfn = memory_bm_next_pfn(bm, 0); while (pfn != BM_END_OF_MAP) { if (PageHighMem(pfn_to_page(pfn))) cnt++; - pfn = memory_bm_next_pfn(bm); + pfn = memory_bm_next_pfn(bm, 0); } return cnt; } @@ -2189,7 +2225,7 @@ page = alloc_page(__GFP_HIGHMEM); if (!swsusp_page_is_free(page)) { /* The page is "safe", set its bit the bitmap */ - memory_bm_set_bit(bm, page_to_pfn(page)); + memory_bm_set_bit(bm, 0, page_to_pfn(page)); safe_highmem_pages++; } /* Mark the page as allocated */ @@ -2247,7 +2283,7 @@ /* Copy of the page will be stored in high memory */ kaddr = buffer; - tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); + tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm, 0)); safe_highmem_pages--; last_highmem_page = tmp; pbe->copy_page = tmp; @@ -2418,7 +2454,7 @@ { struct pbe *pbe; struct page *page; - unsigned long pfn = memory_bm_next_pfn(bm); + unsigned long pfn = memory_bm_next_pfn(bm, 0); if (pfn == BM_END_OF_MAP) return ERR_PTR(-EFAULT); @@ -2605,3 +2641,82 @@ return 0; } #endif /* CONFIG_HIGHMEM */ + +struct memory_bitmap *pageset1_map, *pageset2_map, *free_map, *nosave_map, + *pageset1_copy_map, *io_map, *page_resave_map, *compare_map; + +int resume_attempted; + +int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) + (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) +{ + int result; + + memory_bm_position_reset(bm); + + do { + result = rw_chunk(WRITE, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE); + + if (result) + return result; + } while (rtree_next_node(bm, 0)); + return 0; +} + +int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) + (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) +{ + int result; + + memory_bm_position_reset(bm); + + do { + result = rw_chunk(READ, NULL, (char *) bm->cur[0].node->data, PAGE_SIZE); + + if (result) + return result; + + } while (rtree_next_node(bm, 0)); + return 0; +} + +int memory_bm_space_needed(struct memory_bitmap *bm) +{ + unsigned long bytes = 0; + + memory_bm_position_reset(bm); + do { + bytes += PAGE_SIZE; + } while (rtree_next_node(bm, 0)); + return bytes; +} + +int toi_alloc_bitmap(struct memory_bitmap **bm) +{ + int error; + struct memory_bitmap *bm1; + + bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); + if (!bm1) + 
return -ENOMEM; + + error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); + if (error) { + printk("Error returned - %d.\n", error); + kfree(bm1); + return -ENOMEM; + } + + *bm = bm1; + return 0; +} + +void toi_free_bitmap(struct memory_bitmap **bm) +{ + if (!*bm) + return; + + memory_bm_free(*bm, 0); + kfree(*bm); + *bm = NULL; +} diff -Nur linux-4.2/kernel/power/tuxonice.h linux-4.2-pck/kernel/power/tuxonice.h --- linux-4.2/kernel/power/tuxonice.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice.h 2015-09-08 11:20:32.460345634 -0300 @@ -0,0 +1,260 @@ +/* + * kernel/power/tuxonice.h + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * It contains declarations used throughout swsusp. + * + */ + +#ifndef KERNEL_POWER_TOI_H +#define KERNEL_POWER_TOI_H + +#include +#include +#include +#include +#include +#include "tuxonice_pageflags.h" +#include "power.h" + +#define TOI_CORE_VERSION "3.3" +#define TOI_HEADER_VERSION 3 +#define MY_BOOT_KERNEL_DATA_VERSION 4 + +struct toi_boot_kernel_data { + int version; + int size; + unsigned long toi_action; + unsigned long toi_debug_state; + u32 toi_default_console_level; + int toi_io_time[2][2]; + char toi_nosave_commandline[COMMAND_LINE_SIZE]; + unsigned long pages_used[33]; + unsigned long incremental_bytes_in; + unsigned long incremental_bytes_out; + unsigned long compress_bytes_in; + unsigned long compress_bytes_out; + unsigned long pruned_pages; +}; + +extern struct toi_boot_kernel_data toi_bkd; + +/* Location of book kernel data struct in kernel being resumed */ +extern unsigned long boot_kernel_data_buffer; + +/* == Action states == */ + +enum { + TOI_REBOOT, + TOI_PAUSE, + TOI_LOGALL, + TOI_CAN_CANCEL, + TOI_KEEP_IMAGE, + TOI_FREEZER_TEST, + TOI_SINGLESTEP, + TOI_PAUSE_NEAR_PAGESET_END, + TOI_TEST_FILTER_SPEED, + TOI_TEST_BIO, + TOI_NO_PAGESET2, + TOI_IGNORE_ROOTFS, + TOI_REPLACE_SWSUSP, + TOI_PAGESET2_FULL, + TOI_ABORT_ON_RESAVE_NEEDED, + TOI_NO_MULTITHREADED_IO, + TOI_NO_DIRECT_LOAD, /* Obsolete */ + TOI_LATE_CPU_HOTPLUG, /* Obsolete */ + TOI_GET_MAX_MEM_ALLOCD, + TOI_NO_FLUSHER_THREAD, + TOI_NO_PS2_IF_UNNEEDED, + TOI_POST_RESUME_BREAKPOINT, + TOI_NO_READAHEAD, + TOI_TRACE_DEBUG_ON, + TOI_INCREMENTAL_IMAGE, +}; + +extern unsigned long toi_bootflags_mask; + +#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action)) + +/* == Result states == */ + +enum { + TOI_ABORTED, + TOI_ABORT_REQUESTED, + TOI_NOSTORAGE_AVAILABLE, + TOI_INSUFFICIENT_STORAGE, + TOI_FREEZING_FAILED, + TOI_KEPT_IMAGE, + TOI_WOULD_EAT_MEMORY, + TOI_UNABLE_TO_FREE_ENOUGH_MEMORY, + TOI_PM_SEM, + TOI_DEVICE_REFUSED, + TOI_SYSDEV_REFUSED, + TOI_EXTRA_PAGES_ALLOW_TOO_SMALL, + TOI_UNABLE_TO_PREPARE_IMAGE, + TOI_FAILED_MODULE_INIT, + TOI_FAILED_MODULE_CLEANUP, + TOI_FAILED_IO, + TOI_OUT_OF_MEMORY, + TOI_IMAGE_ERROR, + TOI_PLATFORM_PREP_FAILED, + TOI_CPU_HOTPLUG_FAILED, + TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */ + TOI_RESAVE_NEEDED, + TOI_CANT_SUSPEND, + TOI_NOTIFIERS_PREPARE_FAILED, + TOI_PRE_SNAPSHOT_FAILED, + TOI_PRE_RESTORE_FAILED, + TOI_USERMODE_HELPERS_ERR, + TOI_CANT_USE_ALT_RESUME, + TOI_HEADER_TOO_BIG, + TOI_WAKEUP_EVENT, + TOI_SYSCORE_REFUSED, + TOI_DPM_PREPARE_FAILED, + TOI_DPM_SUSPEND_FAILED, + TOI_NUM_RESULT_STATES /* Used in printing debug info only */ +}; + +extern unsigned long toi_result; + +#define set_result_state(bit) (test_and_set_bit(bit, &toi_result)) +#define set_abort_result(bit) (test_and_set_bit(TOI_ABORTED, &toi_result), \ + 
test_and_set_bit(bit, &toi_result)) +#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result)) +#define test_result_state(bit) (test_bit(bit, &toi_result)) + +/* == Debug sections and levels == */ + +/* debugging levels. */ +enum { + TOI_STATUS = 0, + TOI_ERROR = 2, + TOI_LOW, + TOI_MEDIUM, + TOI_HIGH, + TOI_VERBOSE, +}; + +enum { + TOI_ANY_SECTION, + TOI_EAT_MEMORY, + TOI_IO, + TOI_HEADER, + TOI_WRITER, + TOI_MEMORY, + TOI_PAGEDIR, + TOI_COMPRESS, + TOI_BIO, +}; + +#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state)) +#define clear_debug_state(bit) \ + (test_and_clear_bit(bit, &toi_bkd.toi_debug_state)) +#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state)) + +/* == Steps in hibernating == */ + +enum { + STEP_HIBERNATE_PREPARE_IMAGE, + STEP_HIBERNATE_SAVE_IMAGE, + STEP_HIBERNATE_POWERDOWN, + STEP_RESUME_CAN_RESUME, + STEP_RESUME_LOAD_PS1, + STEP_RESUME_DO_RESTORE, + STEP_RESUME_READ_PS2, + STEP_RESUME_GO, + STEP_RESUME_ALT_IMAGE, + STEP_CLEANUP, + STEP_QUIET_CLEANUP +}; + +/* == TuxOnIce states == + (see also include/linux/suspend.h) */ + +#define get_toi_state() (toi_state) +#define restore_toi_state(saved_state) \ + do { toi_state = saved_state; } while (0) + +/* == Module support == */ + +struct toi_core_fns { + int (*post_context_save)(void); + unsigned long (*get_nonconflicting_page)(void); + int (*try_hibernate)(void); + void (*try_resume)(void); +}; + +extern struct toi_core_fns *toi_core_fns; + +/* == All else == */ +#define KB(x) ((x) << (PAGE_SHIFT - 10)) +#define MB(x) ((x) >> (20 - PAGE_SHIFT)) + +extern int toi_start_anything(int toi_or_resume); +extern void toi_finish_anything(int toi_or_resume); + +extern int save_image_part1(void); +extern int toi_atomic_restore(void); + +extern int toi_try_hibernate(void); +extern void toi_try_resume(void); + +extern int __toi_post_context_save(void); + +extern unsigned int nr_hibernates; +extern char alt_resume_param[256]; + +extern void copyback_post(void); +extern int toi_hibernate(void); +extern unsigned long extra_pd1_pages_used; + +#define SECTOR_SIZE 512 + +extern void toi_early_boot_message(int can_erase_image, int default_answer, + char *warning_reason, ...); + +extern int do_check_can_resume(void); +extern int do_toi_step(int step); +extern int toi_launch_userspace_program(char *command, int channel_no, + int wait, int debug); + +extern char tuxonice_signature[9]; + +extern int toi_start_other_threads(void); +extern void toi_stop_other_threads(void); + +extern int toi_trace_index; +#define TOI_TRACE_DEBUG(PFN, DESC, ...) 
\ + do { \ + if (test_action_state(TOI_TRACE_DEBUG_ON)) { \ + printk("*TOI* %ld %02d" DESC "\n", PFN, toi_trace_index, ##__VA_ARGS__); \ + } \ + } while(0) + +#ifdef CONFIG_TOI_KEEP_IMAGE +#define toi_keeping_image (test_action_state(TOI_KEEP_IMAGE) || test_action_state(TOI_INCREMENTAL_IMAGE)) +#else +#define toi_keeping_image (0) +#endif + +#ifdef CONFIG_TOI_INCREMENTAL +extern void toi_reset_dirtiness_one(unsigned long pfn, int verbose); +extern int toi_reset_dirtiness(int verbose); +extern void toi_cbw_write(void); +extern void toi_cbw_restore(void); +extern int toi_allocate_cbw_data(void); +extern void toi_free_cbw_data(void); +extern int toi_cbw_init(void); +extern void toi_mark_tasks_cbw(void); +#else +static inline int toi_reset_dirtiness(int verbose) { return 0; } +#define toi_cbw_write() do { } while(0) +#define toi_cbw_restore() do { } while(0) +#define toi_allocate_cbw_data() do { } while(0) +#define toi_free_cbw_data() do { } while(0) +static inline int toi_cbw_init(void) { return 0; } +#endif +#endif diff -Nur linux-4.2/kernel/power/tuxonice_alloc.c linux-4.2-pck/kernel/power/tuxonice_alloc.c --- linux-4.2/kernel/power/tuxonice_alloc.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_alloc.c 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,308 @@ +/* + * kernel/power/tuxonice_alloc.c + * + * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + */ + +#include +#include +#include "tuxonice_modules.h" +#include "tuxonice_alloc.h" +#include "tuxonice_sysfs.h" +#include "tuxonice.h" + +#define TOI_ALLOC_PATHS 41 + +static DEFINE_MUTEX(toi_alloc_mutex); + +static struct toi_module_ops toi_alloc_ops; + +static int toi_fail_num; + +static atomic_t toi_alloc_count[TOI_ALLOC_PATHS], + toi_free_count[TOI_ALLOC_PATHS], + toi_test_count[TOI_ALLOC_PATHS], + toi_fail_count[TOI_ALLOC_PATHS]; +static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS]; +static int cur_allocd, max_allocd; + +static char *toi_alloc_desc[TOI_ALLOC_PATHS] = { + "", /* 0 */ + "get_io_info_struct", + "extent", + "extent (loading chain)", + "userui channel", + "userui arg", /* 5 */ + "attention list metadata", + "extra pagedir memory metadata", + "bdev metadata", + "extra pagedir memory", + "header_locations_read", /* 10 */ + "bio queue", + "prepare_readahead", + "i/o buffer", + "writer buffer in bio_init", + "checksum buffer", /* 15 */ + "compression buffer", + "filewriter signature op", + "set resume param alloc1", + "set resume param alloc2", + "debugging info buffer", /* 20 */ + "check can resume buffer", + "write module config buffer", + "read module config buffer", + "write image header buffer", + "read pageset1 buffer", /* 25 */ + "get_have_image_data buffer", + "checksum page", + "worker rw loop", + "get nonconflicting page", + "ps1 load addresses", /* 30 */ + "remove swap image", + "swap image exists", + "swap parse sig location", + "sysfs kobj", + "swap mark resume attempted buffer", /* 35 */ + "cluster member", + "boot kernel data buffer", + "setting swap signature", + "block i/o bdev struct", + "copy before write", /* 40 */ +}; + +#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \ + do { \ + BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \ + \ + if (FAIL_NUM == toi_fail_num) { \ + atomic_inc(&toi_test_count[FAIL_NUM]); \ + toi_fail_num = 0; \ + return FAIL_VAL; \ + } \ + } while (0) + +static void alloc_update_stats(int fail_num, void *result, int size) +{ + if (!result) { + 
atomic_inc(&toi_fail_count[fail_num]); + return; + } + + atomic_inc(&toi_alloc_count[fail_num]); + if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { + mutex_lock(&toi_alloc_mutex); + toi_cur_allocd[fail_num]++; + cur_allocd += size; + if (unlikely(cur_allocd > max_allocd)) { + int i; + + for (i = 0; i < TOI_ALLOC_PATHS; i++) + toi_max_allocd[i] = toi_cur_allocd[i]; + max_allocd = cur_allocd; + } + mutex_unlock(&toi_alloc_mutex); + } +} + +static void free_update_stats(int fail_num, int size) +{ + BUG_ON(fail_num >= TOI_ALLOC_PATHS); + atomic_inc(&toi_free_count[fail_num]); + if (unlikely(atomic_read(&toi_free_count[fail_num]) > + atomic_read(&toi_alloc_count[fail_num]))) + dump_stack(); + if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { + mutex_lock(&toi_alloc_mutex); + cur_allocd -= size; + toi_cur_allocd[fail_num]--; + mutex_unlock(&toi_alloc_mutex); + } +} + +void *toi_kzalloc(int fail_num, size_t size, gfp_t flags) +{ + void *result; + + if (toi_alloc_ops.enabled) + MIGHT_FAIL(fail_num, NULL); + result = kzalloc(size, flags); + if (toi_alloc_ops.enabled) + alloc_update_stats(fail_num, result, size); + if (fail_num == toi_trace_allocs) + dump_stack(); + return result; +} + +unsigned long toi_get_free_pages(int fail_num, gfp_t mask, + unsigned int order) +{ + unsigned long result; + + mask |= ___GFP_TOI_NOTRACK; + if (toi_alloc_ops.enabled) + MIGHT_FAIL(fail_num, 0); + result = __get_free_pages(mask, order); + if (toi_alloc_ops.enabled) + alloc_update_stats(fail_num, (void *) result, + PAGE_SIZE << order); + if (fail_num == toi_trace_allocs) + dump_stack(); + return result; +} + +struct page *toi_alloc_page(int fail_num, gfp_t mask) +{ + struct page *result; + + if (toi_alloc_ops.enabled) + MIGHT_FAIL(fail_num, NULL); + mask |= ___GFP_TOI_NOTRACK; + result = alloc_page(mask); + if (toi_alloc_ops.enabled) + alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); + if (fail_num == toi_trace_allocs) + dump_stack(); + return result; +} + +unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask) +{ + unsigned long result; + + if (toi_alloc_ops.enabled) + MIGHT_FAIL(fail_num, 0); + mask |= ___GFP_TOI_NOTRACK; + result = get_zeroed_page(mask); + if (toi_alloc_ops.enabled) + alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); + if (fail_num == toi_trace_allocs) + dump_stack(); + return result; +} + +void toi_kfree(int fail_num, const void *arg, int size) +{ + if (arg && toi_alloc_ops.enabled) + free_update_stats(fail_num, size); + + if (fail_num == toi_trace_allocs) + dump_stack(); + kfree(arg); +} + +void toi_free_page(int fail_num, unsigned long virt) +{ + if (virt && toi_alloc_ops.enabled) + free_update_stats(fail_num, PAGE_SIZE); + + if (fail_num == toi_trace_allocs) + dump_stack(); + free_page(virt); +} + +void toi__free_page(int fail_num, struct page *page) +{ + if (page && toi_alloc_ops.enabled) + free_update_stats(fail_num, PAGE_SIZE); + + if (fail_num == toi_trace_allocs) + dump_stack(); + __free_page(page); +} + +void toi_free_pages(int fail_num, struct page *page, int order) +{ + if (page && toi_alloc_ops.enabled) + free_update_stats(fail_num, PAGE_SIZE << order); + + if (fail_num == toi_trace_allocs) + dump_stack(); + __free_pages(page, order); +} + +void toi_alloc_print_debug_stats(void) +{ + int i, header_done = 0; + + if (!toi_alloc_ops.enabled) + return; + + for (i = 0; i < TOI_ALLOC_PATHS; i++) + if (atomic_read(&toi_alloc_count[i]) != + atomic_read(&toi_free_count[i])) { + if (!header_done) { + printk(KERN_INFO "Idx Allocs Frees Tests " + " Fails 
Max Description\n"); + header_done = 1; + } + + printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i, + atomic_read(&toi_alloc_count[i]), + atomic_read(&toi_free_count[i]), + atomic_read(&toi_test_count[i]), + atomic_read(&toi_fail_count[i]), + toi_max_allocd[i], + toi_alloc_desc[i]); + } +} + +static int toi_alloc_initialise(int starting_cycle) +{ + int i; + + if (!starting_cycle) + return 0; + + if (toi_trace_allocs) + dump_stack(); + + for (i = 0; i < TOI_ALLOC_PATHS; i++) { + atomic_set(&toi_alloc_count[i], 0); + atomic_set(&toi_free_count[i], 0); + atomic_set(&toi_test_count[i], 0); + atomic_set(&toi_fail_count[i], 0); + toi_cur_allocd[i] = 0; + toi_max_allocd[i] = 0; + }; + + max_allocd = 0; + cur_allocd = 0; + return 0; +} + +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL), + SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0, + NULL), + SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action, + TOI_GET_MAX_MEM_ALLOCD, 0), + SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0, + NULL) +}; + +static struct toi_module_ops toi_alloc_ops = { + .type = MISC_HIDDEN_MODULE, + .name = "allocation debugging", + .directory = "alloc", + .module = THIS_MODULE, + .early = 1, + .initialise = toi_alloc_initialise, + + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +int toi_alloc_init(void) +{ + int result = toi_register_module(&toi_alloc_ops); + return result; +} + +void toi_alloc_exit(void) +{ + toi_unregister_module(&toi_alloc_ops); +} diff -Nur linux-4.2/kernel/power/tuxonice_alloc.h linux-4.2-pck/kernel/power/tuxonice_alloc.h --- linux-4.2/kernel/power/tuxonice_alloc.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_alloc.h 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,54 @@ +/* + * kernel/power/tuxonice_alloc.h + * + * Copyright (C) 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. 
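+ *
+ * The wrappers declared below take a per-call-site index (fail_num) so
+ * that, when CONFIG_PM_DEBUG is enabled, tuxonice_alloc.c can count
+ * allocations and frees per site, report imbalances and optionally
+ * inject failures for testing. Without CONFIG_PM_DEBUG they collapse
+ * to the plain kernel allocators.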
+ * + */ + +#include +#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN) +#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN) + +#ifdef CONFIG_PM_DEBUG +extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags); +extern void toi_kfree(int fail_num, const void *arg, int size); + +extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask, + unsigned int order); +#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0) +extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask); +extern void toi_free_page(int fail_num, unsigned long buf); +extern void toi__free_page(int fail_num, struct page *page); +extern void toi_free_pages(int fail_num, struct page *page, int order); +extern struct page *toi_alloc_page(int fail_num, gfp_t mask); +extern int toi_alloc_init(void); +extern void toi_alloc_exit(void); + +extern void toi_alloc_print_debug_stats(void); + +#else /* CONFIG_PM_DEBUG */ + +#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS)) +#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN)) + +#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER) +#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS) +#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS) +#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0) +#define toi__free_page(FAIL, PAGE) __free_page(PAGE) +#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER) +#define toi_alloc_page(FAIL, MASK) alloc_page(MASK) +static inline int toi_alloc_init(void) +{ + return 0; +} + +static inline void toi_alloc_exit(void) { } + +static inline void toi_alloc_print_debug_stats(void) { } + +#endif + +extern int toi_trace_allocs; diff -Nur linux-4.2/kernel/power/tuxonice_atomic_copy.c linux-4.2-pck/kernel/power/tuxonice_atomic_copy.c --- linux-4.2/kernel/power/tuxonice_atomic_copy.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_atomic_copy.c 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,469 @@ +/* + * kernel/power/tuxonice_atomic_copy.c + * + * Copyright 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * Distributed under GPLv2. + * + * Routines for doing the atomic save/restore. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "tuxonice.h" +#include "tuxonice_storage.h" +#include "tuxonice_power_off.h" +#include "tuxonice_ui.h" +#include "tuxonice_io.h" +#include "tuxonice_prepare_image.h" +#include "tuxonice_pageflags.h" +#include "tuxonice_checksum.h" +#include "tuxonice_builtin.h" +#include "tuxonice_atomic_copy.h" +#include "tuxonice_alloc.h" +#include "tuxonice_modules.h" + +unsigned long extra_pd1_pages_used; + +/** + * free_pbe_list - free page backup entries used by the atomic copy code. + * @list: List to free. + * @highmem: Whether the list is in highmem. + * + * Normally, this function isn't used. If, however, we need to abort before + * doing the atomic copy, we use this to free the pbes previously allocated. 
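+ *
+ * Each page in the list holds up to PBES_PER_PAGE entries. For every
+ * entry we free the copy page it refers to (pbe->address), then free
+ * the page holding the entries themselves and continue with the next
+ * page via the link left in the last entry. Highmem lists are kmapped
+ * so the entries can be walked.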
+ **/ +static void free_pbe_list(struct pbe **list, int highmem) +{ + while (*list) { + int i; + struct pbe *free_pbe, *next_page = NULL; + struct page *page; + + if (highmem) { + page = (struct page *) *list; + free_pbe = (struct pbe *) kmap(page); + } else { + page = virt_to_page(*list); + free_pbe = *list; + } + + for (i = 0; i < PBES_PER_PAGE; i++) { + if (!free_pbe) + break; + if (highmem) + toi__free_page(29, free_pbe->address); + else + toi_free_page(29, + (unsigned long) free_pbe->address); + free_pbe = free_pbe->next; + } + + if (highmem) { + if (free_pbe) + next_page = free_pbe; + kunmap(page); + } else { + if (free_pbe) + next_page = free_pbe; + } + + toi__free_page(29, page); + *list = (struct pbe *) next_page; + }; +} + +/** + * copyback_post - post atomic-restore actions + * + * After doing the atomic restore, we have a few more things to do: + * 1) We want to retain some values across the restore, so we now copy + * these from the nosave variables to the normal ones. + * 2) Set the status flags. + * 3) Resume devices. + * 4) Tell userui so it can redraw & restore settings. + * 5) Reread the page cache. + **/ +void copyback_post(void) +{ + struct toi_boot_kernel_data *bkd = + (struct toi_boot_kernel_data *) boot_kernel_data_buffer; + + if (toi_activate_storage(1)) + panic("Failed to reactivate our storage."); + + toi_post_atomic_restore_modules(bkd); + + toi_cond_pause(1, "About to reload secondary pagedir."); + + if (read_pageset2(0)) + panic("Unable to successfully reread the page cache."); + + /* + * If the user wants to sleep again after resuming from full-off, + * it's most likely to be in order to suspend to ram, so we'll + * do this check after loading pageset2, to give them the fastest + * wakeup when they are ready to use the computer again. + */ + toi_check_resleep(); + + if (test_action_state(TOI_INCREMENTAL_IMAGE)) + toi_reset_dirtiness(1); +} + +/** + * toi_copy_pageset1 - do the atomic copy of pageset1 + * + * Make the atomic copy of pageset1. We can't use copy_page (as we once did) + * because we can't be sure what side effects it has. On my old Duron, with + * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt + * count at resume time 4 instead of 3. + * + * We don't want to call kmap_atomic unconditionally because it has the side + * effect of incrementing the preempt count, which will leave it one too high + * post resume (the page containing the preempt count will be copied after + * its incremented. This is essentially the same problem. + **/ +void toi_copy_pageset1(void) +{ + int i; + unsigned long source_index, dest_index; + + memory_bm_position_reset(pageset1_map); + memory_bm_position_reset(pageset1_copy_map); + + source_index = memory_bm_next_pfn(pageset1_map, 0); + dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); + + for (i = 0; i < pagedir1.size; i++) { + unsigned long *origvirt, *copyvirt; + struct page *origpage, *copypage; + int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1, + was_present1, was_present2; + + origpage = pfn_to_page(source_index); + copypage = pfn_to_page(dest_index); + + origvirt = PageHighMem(origpage) ? + kmap_atomic(origpage) : + page_address(origpage); + + copyvirt = PageHighMem(copypage) ? 
+ kmap_atomic(copypage) : + page_address(copypage); + + was_present1 = kernel_page_present(origpage); + if (!was_present1) + kernel_map_pages(origpage, 1, 1); + + was_present2 = kernel_page_present(copypage); + if (!was_present2) + kernel_map_pages(copypage, 1, 1); + + while (loop >= 0) { + *(copyvirt + loop) = *(origvirt + loop); + loop--; + } + + if (!was_present1) + kernel_map_pages(origpage, 1, 0); + + if (!was_present2) + kernel_map_pages(copypage, 1, 0); + + if (PageHighMem(origpage)) + kunmap_atomic(origvirt); + + if (PageHighMem(copypage)) + kunmap_atomic(copyvirt); + + source_index = memory_bm_next_pfn(pageset1_map, 0); + dest_index = memory_bm_next_pfn(pageset1_copy_map, 0); + } +} + +/** + * __toi_post_context_save - steps after saving the cpu context + * + * Steps taken after saving the CPU state to make the actual + * atomic copy. + * + * Called from swsusp_save in snapshot.c via toi_post_context_save. + **/ +int __toi_post_context_save(void) +{ + unsigned long old_ps1_size = pagedir1.size; + + check_checksums(); + + free_checksum_pages(); + + toi_recalculate_image_contents(1); + + extra_pd1_pages_used = pagedir1.size > old_ps1_size ? + pagedir1.size - old_ps1_size : 0; + + if (extra_pd1_pages_used > extra_pd1_pages_allowance) { + printk(KERN_INFO "Pageset1 has grown by %lu pages. " + "extra_pages_allowance is currently only %lu.\n", + pagedir1.size - old_ps1_size, + extra_pd1_pages_allowance); + + /* + * Highlevel code will see this, clear the state and + * retry if we haven't already done so twice. + */ + if (any_to_free(1)) { + set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); + return 1; + } + if (try_allocate_extra_memory()) { + printk(KERN_INFO "Failed to allocate the extra memory" + " needed. Restarting the process."); + set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); + return 1; + } + printk(KERN_INFO "However it looks like there's enough" + " free ram and storage to handle this, so " + " continuing anyway."); + /* + * What if try_allocate_extra_memory above calls + * toi_allocate_extra_pagedir_memory and it allocs a new + * slab page via toi_kzalloc which should be in ps1? So... + */ + toi_recalculate_image_contents(1); + } + + if (!test_action_state(TOI_TEST_FILTER_SPEED) && + !test_action_state(TOI_TEST_BIO)) + toi_copy_pageset1(); + + return 0; +} + +/** + * toi_hibernate - high level code for doing the atomic copy + * + * High-level code which prepares to do the atomic copy. Loosely based + * on the swsusp version, but with the following twists: + * - We set toi_running so the swsusp code uses our code paths. + * - We give better feedback regarding what goes wrong if there is a + * problem. + * - We use an extra function to call the assembly, just in case this code + * is in a module (return address). + **/ +int toi_hibernate(void) +{ + int error; + + error = toi_lowlevel_builtin(); + + if (!error) { + struct toi_boot_kernel_data *bkd = + (struct toi_boot_kernel_data *) boot_kernel_data_buffer; + + /* + * The boot kernel's data may be larger (newer version) or + * smaller (older version) than ours. Copy the minimum + * of the two sizes, so that we don't overwrite valid values + * from pre-atomic copy. + */ + + memcpy(&toi_bkd, (char *) boot_kernel_data_buffer, + min_t(int, sizeof(struct toi_boot_kernel_data), + bkd->size)); + } + + return error; +} + +/** + * toi_atomic_restore - prepare to do the atomic restore + * + * Get ready to do the atomic restore. 
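+ * On success this function does not return here: swsusp_arch_resume()
+ * hands control back to the image kernel, which continues from the
+ * point where swsusp_arch_suspend() was originally called.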
This part gets us into the same + * state we are in prior to do calling do_toi_lowlevel while + * hibernating: hot-unplugging secondary cpus and freeze processes, + * before starting the thread that will do the restore. + **/ +int toi_atomic_restore(void) +{ + int error; + + toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore."); + + memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line, + strlen(saved_command_line)); + + toi_pre_atomic_restore_modules(&toi_bkd); + + if (add_boot_kernel_data_pbe()) + goto Failed; + + toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); + + if (toi_go_atomic(PMSG_QUIESCE, 0)) + goto Failed; + + /* We'll ignore saved state, but this gets preempt count (etc) right */ + save_processor_state(); + + error = swsusp_arch_resume(); + /* + * Code below is only ever reached in case of failure. Otherwise + * execution continues at place where swsusp_arch_suspend was called. + * + * We don't know whether it's safe to continue (this shouldn't happen), + * so lets err on the side of caution. + */ + BUG(); + +Failed: + free_pbe_list(&restore_pblist, 0); +#ifdef CONFIG_HIGHMEM + free_pbe_list(&restore_highmem_pblist, 1); +#endif + return 1; +} + +/** + * toi_go_atomic - do the actual atomic copy/restore + * @state: The state to use for dpm_suspend_start & power_down calls. + * @suspend_time: Whether we're suspending or resuming. + **/ +int toi_go_atomic(pm_message_t state, int suspend_time) +{ + if (suspend_time) { + if (platform_begin(1)) { + set_abort_result(TOI_PLATFORM_PREP_FAILED); + toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); + return 1; + } + + if (dpm_prepare(PMSG_FREEZE)) { + set_abort_result(TOI_DPM_PREPARE_FAILED); + dpm_complete(PMSG_RECOVER); + toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); + return 1; + } + } + + suspend_console(); + pm_restrict_gfp_mask(); + + if (suspend_time) { + if (dpm_suspend(state)) { + set_abort_result(TOI_DPM_SUSPEND_FAILED); + toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); + return 1; + } + } else { + if (dpm_suspend_start(state)) { + set_abort_result(TOI_DPM_SUSPEND_FAILED); + toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); + return 1; + } + } + + /* At this point, dpm_suspend_start() has been called, but *not* + * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now. + * Otherwise, drivers for some devices (e.g. interrupt controllers) + * become desynchronized with the actual state of the hardware + * at resume time, and evil weirdness ensues. 
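+	 *
+	 * Any failure below unwinds through toi_end_atomic(), passing the
+	 * ATOMIC_STEP_* at which resuming should start, so the steps already
+	 * taken are undone in reverse order.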
+ */ + + if (dpm_suspend_end(state)) { + set_abort_result(TOI_DEVICE_REFUSED); + toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1); + return 1; + } + + if (suspend_time) { + if (platform_pre_snapshot(1)) + set_abort_result(TOI_PRE_SNAPSHOT_FAILED); + } else { + if (platform_pre_restore(1)) + set_abort_result(TOI_PRE_RESTORE_FAILED); + } + + if (test_result_state(TOI_ABORTED)) { + toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1); + return 1; + } + + if (disable_nonboot_cpus()) { + set_abort_result(TOI_CPU_HOTPLUG_FAILED); + toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, + suspend_time, 1); + return 1; + } + + local_irq_disable(); + + if (syscore_suspend()) { + set_abort_result(TOI_SYSCORE_REFUSED); + toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1); + return 1; + } + + if (suspend_time && pm_wakeup_pending()) { + set_abort_result(TOI_WAKEUP_EVENT); + toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1); + return 1; + } + return 0; +} + +/** + * toi_end_atomic - post atomic copy/restore routines + * @stage: What step to start at. + * @suspend_time: Whether we're suspending or resuming. + * @error: Whether we're recovering from an error. + **/ +void toi_end_atomic(int stage, int suspend_time, int error) +{ + pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) : + PMSG_RESTORE; + + switch (stage) { + case ATOMIC_ALL_STEPS: + if (!suspend_time) { + events_check_enabled = false; + } + platform_leave(1); + case ATOMIC_STEP_SYSCORE_RESUME: + syscore_resume(); + case ATOMIC_STEP_IRQS: + local_irq_enable(); + case ATOMIC_STEP_CPU_HOTPLUG: + enable_nonboot_cpus(); + case ATOMIC_STEP_PLATFORM_FINISH: + if (!suspend_time && error & 2) + platform_restore_cleanup(1); + else + platform_finish(1); + dpm_resume_start(msg); + case ATOMIC_STEP_DEVICE_RESUME: + if (suspend_time && (error & 2)) + platform_recover(1); + dpm_resume(msg); + if (!toi_in_suspend()) { + dpm_resume_end(PMSG_RECOVER); + } + if (error || !toi_in_suspend()) { + pm_restore_gfp_mask(); + } + resume_console(); + case ATOMIC_STEP_DPM_COMPLETE: + dpm_complete(msg); + case ATOMIC_STEP_PLATFORM_END: + platform_end(1); + + toi_prepare_status(DONT_CLEAR_BAR, "Post atomic."); + } +} diff -Nur linux-4.2/kernel/power/tuxonice_atomic_copy.h linux-4.2-pck/kernel/power/tuxonice_atomic_copy.h --- linux-4.2/kernel/power/tuxonice_atomic_copy.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_atomic_copy.h 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,25 @@ +/* + * kernel/power/tuxonice_atomic_copy.h + * + * Copyright 2008-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * Distributed under GPLv2. + * + * Routines for doing the atomic save/restore. + */ + +enum { + ATOMIC_ALL_STEPS, + ATOMIC_STEP_SYSCORE_RESUME, + ATOMIC_STEP_IRQS, + ATOMIC_STEP_CPU_HOTPLUG, + ATOMIC_STEP_PLATFORM_FINISH, + ATOMIC_STEP_DEVICE_RESUME, + ATOMIC_STEP_DPM_COMPLETE, + ATOMIC_STEP_PLATFORM_END, +}; + +int toi_go_atomic(pm_message_t state, int toi_time); +void toi_end_atomic(int stage, int toi_time, int error); + +extern void platform_recover(int platform_mode); diff -Nur linux-4.2/kernel/power/tuxonice_bio.h linux-4.2-pck/kernel/power/tuxonice_bio.h --- linux-4.2/kernel/power/tuxonice_bio.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_bio.h 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,78 @@ +/* + * kernel/power/tuxonice_bio.h + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * Distributed under GPLv2. 
+ * + * This file contains declarations for functions exported from + * tuxonice_bio.c, which contains low level io functions. + */ + +#include +#include "tuxonice_extent.h" + +void toi_put_extent_chain(struct hibernate_extent_chain *chain); +int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, + unsigned long start, unsigned long end); + +struct hibernate_extent_saved_state { + int extent_num; + struct hibernate_extent *extent_ptr; + unsigned long offset; +}; + +struct toi_bdev_info { + struct toi_bdev_info *next; + struct hibernate_extent_chain blocks; + struct block_device *bdev; + struct toi_module_ops *allocator; + int allocator_index; + struct hibernate_extent_chain allocations; + char name[266]; /* "swap on " or "file " + up to 256 chars */ + + /* Saved in header */ + char uuid[17]; + dev_t dev_t; + int prio; + int bmap_shift; + int blocks_per_page; + unsigned long pages_used; + struct hibernate_extent_saved_state saved_state[4]; +}; + +struct toi_extent_iterate_state { + struct toi_bdev_info *current_chain; + int num_chains; + int saved_chain_number[4]; + struct toi_bdev_info *saved_chain_ptr[4]; +}; + +/* + * Our exported interface so the swapwriter and filewriter don't + * need these functions duplicated. + */ +struct toi_bio_ops { + int (*bdev_page_io) (int rw, struct block_device *bdev, long pos, + struct page *page); + int (*register_storage)(struct toi_bdev_info *new); + void (*free_storage)(void); +}; + +struct toi_allocator_ops { + unsigned long (*toi_swap_storage_available) (void); +}; + +extern struct toi_bio_ops toi_bio_ops; + +extern char *toi_writer_buffer; +extern int toi_writer_buffer_posn; + +struct toi_bio_allocator_ops { + int (*register_storage) (void); + unsigned long (*storage_available)(void); + int (*allocate_storage) (struct toi_bdev_info *, unsigned long); + int (*bmap) (struct toi_bdev_info *); + void (*free_storage) (struct toi_bdev_info *); + unsigned long (*free_unused_storage) (struct toi_bdev_info *, unsigned long used); +}; diff -Nur linux-4.2/kernel/power/tuxonice_bio_chains.c linux-4.2-pck/kernel/power/tuxonice_bio_chains.c --- linux-4.2/kernel/power/tuxonice_bio_chains.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_bio_chains.c 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,1126 @@ +/* + * kernel/power/tuxonice_bio_devinfo.c + * + * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * Distributed under GPLv2. + * + */ + +#include +#include "tuxonice_bio.h" +#include "tuxonice_bio_internal.h" +#include "tuxonice_alloc.h" +#include "tuxonice_ui.h" +#include "tuxonice.h" +#include "tuxonice_io.h" + +static struct toi_bdev_info *prio_chain_head; +static int num_chains; + +/* Pointer to current entry being loaded/saved. */ +struct toi_extent_iterate_state toi_writer_posn; + +#define metadata_size (sizeof(struct toi_bdev_info) - \ + offsetof(struct toi_bdev_info, uuid)) + +/* + * After section 0 (header) comes 2 => next_section[0] = 2 + */ +static int next_section[3] = { 2, 3, 1 }; + +/** + * dump_block_chains - print the contents of the bdev info array. + **/ +void dump_block_chains(void) +{ + int i = 0; + int j; + struct toi_bdev_info *cur_chain = prio_chain_head; + + while (cur_chain) { + struct hibernate_extent *this = cur_chain->blocks.first; + + printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio); + + while (this) { + printk(KERN_CONT " [%lu-%lu]%s", this->start, + this->end, this->next ? 
"," : ""); + this = this->next; + } + + printk("\n"); + cur_chain = cur_chain->next; + i++; + } + + printk(KERN_DEBUG "Saved states:\n"); + for (i = 0; i < 4; i++) { + printk(KERN_DEBUG "Slot %d: Chain %d.\n", + i, toi_writer_posn.saved_chain_number[i]); + + cur_chain = prio_chain_head; + j = 0; + while (cur_chain) { + printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n", + j, cur_chain->saved_state[i].extent_num, + cur_chain->saved_state[i].offset); + cur_chain = cur_chain->next; + j++; + } + printk(KERN_CONT "\n"); + } +} + +/** + * + **/ +static void toi_extent_chain_next(void) +{ + struct toi_bdev_info *this = toi_writer_posn.current_chain; + + if (!this->blocks.current_extent) + return; + + if (this->blocks.current_offset == this->blocks.current_extent->end) { + if (this->blocks.current_extent->next) { + this->blocks.current_extent = + this->blocks.current_extent->next; + this->blocks.current_offset = + this->blocks.current_extent->start; + } else { + this->blocks.current_extent = NULL; + this->blocks.current_offset = 0; + } + } else + this->blocks.current_offset++; +} + +/** + * + */ + +static struct toi_bdev_info *__find_next_chain_same_prio(void) +{ + struct toi_bdev_info *start_chain = toi_writer_posn.current_chain; + struct toi_bdev_info *this = start_chain; + int orig_prio = this->prio; + + do { + this = this->next; + + if (!this) + this = prio_chain_head; + + /* Back on original chain? Use it again. */ + if (this == start_chain) + return start_chain; + + } while (!this->blocks.current_extent || this->prio != orig_prio); + + return this; +} + +static void find_next_chain(void) +{ + struct toi_bdev_info *this; + + this = __find_next_chain_same_prio(); + + /* + * If we didn't get another chain of the same priority that we + * can use, look for the next priority. + */ + while (this && !this->blocks.current_extent) + this = this->next; + + toi_writer_posn.current_chain = this; +} + +/** + * toi_extent_state_next - go to the next extent + * @blocks: The number of values to progress. + * @stripe_mode: Whether to spread usage across all chains. + * + * Given a state, progress to the next valid entry. We may begin in an + * invalid state, as we do when invoked after extent_state_goto_start below. + * + * When using compression and expected_compression > 0, we let the image size + * be larger than storage, so we can validly run out of data to return. + **/ +static unsigned long toi_extent_state_next(int blocks, int current_stream) +{ + int i; + + if (!toi_writer_posn.current_chain) + return -ENOSPC; + + /* Assume chains always have lengths that are multiples of @blocks */ + for (i = 0; i < blocks; i++) + toi_extent_chain_next(); + + /* The header stream is not striped */ + if (current_stream || + !toi_writer_posn.current_chain->blocks.current_extent) + find_next_chain(); + + return toi_writer_posn.current_chain ? 
0 : -ENOSPC; +} + +static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this) +{ + struct toi_bdev_info **prev_ptr; + struct toi_bdev_info *cur; + + /* Loop through the existing chain, finding where to insert it */ + prev_ptr = &prio_chain_head; + cur = prio_chain_head; + + while (cur && cur->prio >= this->prio) { + prev_ptr = &cur->next; + cur = cur->next; + } + + this->next = *prev_ptr; + *prev_ptr = this; + + this = prio_chain_head; + while (this) + this = this->next; + num_chains++; +} + +/** + * toi_extent_state_goto_start - reinitialize an extent chain iterator + * @state: Iterator to reinitialize + **/ +void toi_extent_state_goto_start(void) +{ + struct toi_bdev_info *this = prio_chain_head; + + while (this) { + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "Setting current extent to %p.", this->blocks.first); + this->blocks.current_extent = this->blocks.first; + if (this->blocks.current_extent) { + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "Setting current offset to %lu.", + this->blocks.current_extent->start); + this->blocks.current_offset = + this->blocks.current_extent->start; + } + + this = this->next; + } + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.", + prio_chain_head); + toi_writer_posn.current_chain = prio_chain_head; + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start."); +} + +/** + * toi_extent_state_save - save state of the iterator + * @state: Current state of the chain + * @saved_state: Iterator to populate + * + * Given a state and a struct hibernate_extent_state_store, save the current + * position in a format that can be used with relocated chains (at + * resume time). + **/ +void toi_extent_state_save(int slot) +{ + struct toi_bdev_info *cur_chain = prio_chain_head; + struct hibernate_extent *extent; + struct hibernate_extent_saved_state *chain_state; + int i = 0; + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.", + slot); + + if (!toi_writer_posn.current_chain) { + toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => " + "chain_num = -1."); + toi_writer_posn.saved_chain_number[slot] = -1; + return; + } + + while (cur_chain) { + i++; + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) " + "state, slot %d.", i, cur_chain, slot); + + chain_state = &cur_chain->saved_state[slot]; + + chain_state->offset = cur_chain->blocks.current_offset; + + if (toi_writer_posn.current_chain == cur_chain) { + toi_writer_posn.saved_chain_number[slot] = i; + toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain " + "we were on => chain_num is %d.", i); + } + + if (!cur_chain->blocks.current_extent) { + chain_state->extent_num = 0; + toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent " + "for this chain => extent_num %d is 0.", + i); + cur_chain = cur_chain->next; + continue; + } + + extent = cur_chain->blocks.first; + chain_state->extent_num = 1; + + while (extent != cur_chain->blocks.current_extent) { + chain_state->extent_num++; + extent = extent->next; + } + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i, + chain_state->extent_num); + + cur_chain = cur_chain->next; + } + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "Completed saving extent state slot %d.", slot); +} + +/** + * toi_extent_state_restore - restore the position saved by extent_state_save + * @state: State to populate + * @saved_state: Iterator saved to restore + **/ +void toi_extent_state_restore(int slot) +{ + int i = 0; + struct toi_bdev_info *cur_chain = prio_chain_head; + struct hibernate_extent_saved_state 
*chain_state; + + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "toi_extent_state_restore - slot %d.", slot); + + if (toi_writer_posn.saved_chain_number[slot] == -1) { + toi_writer_posn.current_chain = NULL; + return; + } + + while (cur_chain) { + int posn; + int j; + i++; + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) " + "state, slot %d.", i, cur_chain, slot); + + chain_state = &cur_chain->saved_state[slot]; + + posn = chain_state->extent_num; + + cur_chain->blocks.current_extent = cur_chain->blocks.first; + cur_chain->blocks.current_offset = chain_state->offset; + + if (i == toi_writer_posn.saved_chain_number[slot]) { + toi_writer_posn.current_chain = cur_chain; + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "Found current chain."); + } + + for (j = 0; j < 4; j++) + if (i == toi_writer_posn.saved_chain_number[j]) { + toi_writer_posn.saved_chain_ptr[j] = cur_chain; + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "Found saved chain ptr %d (%p) (offset" + " %d).", j, cur_chain, + cur_chain->saved_state[j].offset); + } + + if (posn) { + while (--posn) + cur_chain->blocks.current_extent = + cur_chain->blocks.current_extent->next; + } else + cur_chain->blocks.current_extent = NULL; + + cur_chain = cur_chain->next; + } + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done."); + if (test_action_state(TOI_LOGALL)) + dump_block_chains(); +} + +/* + * Storage needed + * + * Returns amount of space in the image header required + * for the chain data. This ignores the links between + * pages, which we factor in when allocating the space. + */ +int toi_bio_devinfo_storage_needed(void) +{ + int result = sizeof(num_chains); + struct toi_bdev_info *chain = prio_chain_head; + + while (chain) { + result += metadata_size; + + /* Chain size */ + result += sizeof(int); + + /* Extents */ + result += (2 * sizeof(unsigned long) * + chain->blocks.num_extents); + + chain = chain->next; + } + + result += 4 * sizeof(int); + return result; +} + +static unsigned long chain_pages_used(struct toi_bdev_info *chain) +{ + struct hibernate_extent *this = chain->blocks.first; + struct hibernate_extent_saved_state *state = &chain->saved_state[3]; + unsigned long size = 0; + int extent_idx = 1; + + if (!state->extent_num) { + if (!this) + return 0; + else + return chain->blocks.size; + } + + while (extent_idx < state->extent_num) { + size += (this->end - this->start + 1); + this = this->next; + extent_idx++; + } + + /* We didn't use the one we're sitting on, so don't count it */ + return size + state->offset - this->start; +} + +void toi_bio_free_unused_storage_chain(struct toi_bdev_info *chain) +{ + unsigned long used = chain_pages_used(chain); + + /* Free the storage */ + unsigned long first_freed = 0; + + if (chain->allocator->bio_allocator_ops->free_unused_storage) + first_freed = chain->allocator->bio_allocator_ops->free_unused_storage(chain, used); + + printk(KERN_EMERG "Used %ld blocks in this chain. First extent freed is %lx.\n", used, first_freed); + + /* Adjust / free the extents. */ + toi_put_extent_chain_from(&chain->blocks, first_freed); + + { + struct hibernate_extent *this = chain->blocks.first; + while (this) { + printk("Extent %lx-%lx.\n", this->start, this->end); + this = this->next; + } + } +} + +/** + * toi_serialise_extent_chain - write a chain in the image + * @chain: Chain to write. 
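 *
 * [Illustration, not part of the patch.] The bytes written here are exactly
 * what toi_bio_devinfo_storage_needed() above reserves. A self-contained
 * restatement of that arithmetic (function and parameter names are the
 * editor's):
 *
 *     // bytes = one int for the chain count
 *     //       + per chain: metadata_size plus one int for its extent count
 *     //       + two unsigned longs (start, end) per extent
 *     //       + four ints for the saved chain numbers
 *     static unsigned long chain_header_bytes(int chains, int total_extents,
 *                                             unsigned long metadata_size)
 *     {
 *             return sizeof(int)
 *                    + chains * (metadata_size + sizeof(int))
 *                    + total_extents * 2 * sizeof(unsigned long)
 *                    + 4 * sizeof(int);
 *     }
 *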
+ **/ +static int toi_serialise_extent_chain(struct toi_bdev_info *chain) +{ + struct hibernate_extent *this; + int ret; + int i = 1; + + chain->pages_used = chain_pages_used(chain); + + if (test_action_state(TOI_LOGALL)) + dump_block_chains(); + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).", + chain->dev_t); + /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */ + ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, + (char *) &chain->uuid, metadata_size); + if (ret) + return ret; + + /* Num extents */ + ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, + (char *) &chain->blocks.num_extents, sizeof(int)); + if (ret) + return ret; + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", + chain->blocks.num_extents); + + this = chain->blocks.first; + while (this) { + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i); + ret = toiActiveAllocator->rw_header_chunk(WRITE, + &toi_blockwriter_ops, + (char *) this, 2 * sizeof(this->start)); + if (ret) + return ret; + this = this->next; + i++; + } + + return ret; +} + +int toi_serialise_extent_chains(void) +{ + struct toi_bdev_info *this = prio_chain_head; + int result; + + /* Write the number of chains */ + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)", + num_chains); + result = toiActiveAllocator->rw_header_chunk(WRITE, + &toi_blockwriter_ops, (char *) &num_chains, + sizeof(int)); + if (result) + return result; + + /* Then the chains themselves */ + while (this) { + result = toi_serialise_extent_chain(this); + if (result) + return result; + this = this->next; + } + + /* + * Finally, the chain we should be on at the start of each + * section. + */ + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers."); + result = toiActiveAllocator->rw_header_chunk(WRITE, + &toi_blockwriter_ops, + (char *) &toi_writer_posn.saved_chain_number[0], + 4 * sizeof(int)); + + return result; +} + +int toi_register_storage_chain(struct toi_bdev_info *new) +{ + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.", + new); + toi_insert_chain_in_prio_list(new); + return 0; +} + +static void free_bdev_info(struct toi_bdev_info *chain) +{ + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain); + + toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents."); + toi_put_extent_chain(&chain->blocks); + + /* + * The allocator may need to do more than just free the chains + * (swap_free, for example). Don't call from boot kernel. + */ + toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents."); + if (chain->allocator) + chain->allocator->bio_allocator_ops->free_storage(chain); + + /* + * Dropping out of reading atomic copy? Need to undo + * toi_open_by_devnum. 
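 *
 * [Aside, not part of the patch.] For reference, the stream emitted by
 * toi_serialise_extent_chains() earlier in this file is laid out as follows
 * (sizes as written by the code):
 *
 *     int                       number of chains
 *     per chain, highest priority first:
 *         metadata_size bytes   uuid, dev_t, prio, bmap_shift,
 *                               blocks_per_page, pages_used, saved_state[]
 *         int                   number of extents
 *         2 x unsigned long     start and end of each extent, in order
 *     4 x int                   saved chain number for each slot
 *
 * toi_load_extent_chain() and toi_load_extent_chains() below read the same
 * records back in this order.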
+ */ + toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev."); + if (chain->bdev && !IS_ERR(chain->bdev) && + chain->bdev != resume_block_device && + chain->bdev != header_block_device && + test_toi_state(TOI_TRYING_TO_RESUME)) + toi_close_bdev(chain->bdev); + + /* Poison */ + toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct."); + toi_kfree(39, chain, sizeof(*chain)); + + if (prio_chain_head == chain) + prio_chain_head = NULL; + + num_chains--; +} + +void free_all_bdev_info(void) +{ + struct toi_bdev_info *this = prio_chain_head; + + while (this) { + struct toi_bdev_info *next = this->next; + free_bdev_info(this); + this = next; + } + + memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn)); + prio_chain_head = NULL; +} + +static void set_up_start_position(void) +{ + toi_writer_posn.current_chain = prio_chain_head; + go_next_page(0, 0); +} + +/** + * toi_load_extent_chain - read back a chain saved in the image + * @chain: Chain to load + * + * The linked list of extents is reconstructed from the disk. chain will point + * to the first entry. + **/ +int toi_load_extent_chain(int index, int *num_loaded) +{ + struct toi_bdev_info *chain = toi_kzalloc(39, + sizeof(struct toi_bdev_info), GFP_ATOMIC); + struct hibernate_extent *this, *last = NULL; + int i, ret; + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index); + /* Get dev_t, prio, bmap_shift, blocks per page, positions */ + ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, + (char *) &chain->uuid, metadata_size); + + if (ret) { + printk(KERN_ERR "Failed to read the size of extent chain.\n"); + toi_kfree(39, chain, sizeof(*chain)); + return 1; + } + + toi_bkd.pages_used[index] = chain->pages_used; + + ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, + (char *) &chain->blocks.num_extents, sizeof(int)); + if (ret) { + printk(KERN_ERR "Failed to read the size of extent chain.\n"); + toi_kfree(39, chain, sizeof(*chain)); + return 1; + } + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", + chain->blocks.num_extents); + + for (i = 0; i < chain->blocks.num_extents; i++) { + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1); + + this = toi_kzalloc(2, sizeof(struct hibernate_extent), + TOI_ATOMIC_GFP); + if (!this) { + printk(KERN_INFO "Failed to allocate a new extent.\n"); + free_bdev_info(chain); + return -ENOMEM; + } + this->next = NULL; + /* Get the next page */ + ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, + NULL, (char *) this, 2 * sizeof(this->start)); + if (ret) { + printk(KERN_INFO "Failed to read an extent.\n"); + toi_kfree(2, this, sizeof(struct hibernate_extent)); + free_bdev_info(chain); + return 1; + } + + if (last) + last->next = this; + else { + char b1[32], b2[32], b3[32]; + /* + * Open the bdev + */ + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "Chain dev_t is %s. Resume dev t is %s. 
Header" + " bdev_t is %s.\n", + format_dev_t(b1, chain->dev_t), + format_dev_t(b2, resume_dev_t), + format_dev_t(b3, toi_sig_data->header_dev_t)); + + if (chain->dev_t == resume_dev_t) + chain->bdev = resume_block_device; + else if (chain->dev_t == toi_sig_data->header_dev_t) + chain->bdev = header_block_device; + else { + chain->bdev = toi_open_bdev(chain->uuid, + chain->dev_t, 1); + if (IS_ERR(chain->bdev)) { + free_bdev_info(chain); + return -ENODEV; + } + } + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift " + "is %d and blocks per page is %d.", + chain->bmap_shift, + chain->blocks_per_page); + + chain->blocks.first = this; + + /* + * Couldn't do this earlier, but can't do + * goto_start now - we may have already used blocks + * in the first chain. + */ + chain->blocks.current_extent = this; + chain->blocks.current_offset = this->start; + + /* + * Can't wait until we've read the whole chain + * before we insert it in the list. We might need + * this chain to read the next page in the header + */ + toi_insert_chain_in_prio_list(chain); + } + + /* + * We have to wait until 2 extents are loaded before setting up + * properly because if the first extent has only one page, we + * will need to put the position on the second extent. Sounds + * obvious, but it wasn't! + */ + (*num_loaded)++; + if ((*num_loaded) == 2) + set_up_start_position(); + last = this; + } + + /* + * Shouldn't get empty chains, but it's not impossible. Link them in so + * they get freed properly later. + */ + if (!chain->blocks.num_extents) + toi_insert_chain_in_prio_list(chain); + + if (!chain->blocks.current_extent) { + chain->blocks.current_extent = chain->blocks.first; + if (chain->blocks.current_extent) + chain->blocks.current_offset = + chain->blocks.current_extent->start; + } + return 0; +} + +int toi_load_extent_chains(void) +{ + int result; + int to_load; + int i; + int extents_loaded = 0; + + result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, + (char *) &to_load, + sizeof(int)); + if (result) + return result; + toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load); + + for (i = 0; i < to_load; i++) { + toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.", + i, to_load); + result = toi_load_extent_chain(i, &extents_loaded); + if (result) + return result; + } + + /* If we never got to a second extent, we still need to do this. */ + if (extents_loaded == 1) + set_up_start_position(); + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers."); + result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, + &toi_blockwriter_ops, + (char *) &toi_writer_posn.saved_chain_number[0], + 4 * sizeof(int)); + + return result; +} + +static int toi_end_of_stream(int writing, int section_barrier) +{ + struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; + int compare_to = next_section[current_stream]; + struct toi_bdev_info *compare_chain = + toi_writer_posn.saved_chain_ptr[compare_to]; + int compare_offset = compare_chain ? 
+ compare_chain->saved_state[compare_to].offset : 0; + + if (!section_barrier) + return 0; + + if (!cur_chain) + return 1; + + if (cur_chain == compare_chain && + cur_chain->blocks.current_offset == compare_offset) { + if (writing) { + if (!current_stream) { + debug_broken_header(); + return 1; + } + } else { + more_readahead = 0; + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "Reached the end of stream %d " + "(not an error).", current_stream); + return 1; + } + } + + return 0; +} + +/** + * go_next_page - skip blocks to the start of the next page + * @writing: Whether we're reading or writing the image. + * + * Go forward one page. + **/ +int go_next_page(int writing, int section_barrier) +{ + struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; + int max = cur_chain ? cur_chain->blocks_per_page : 1; + + /* Nope. Go foward a page - or maybe two. Don't stripe the header, + * so that bad fragmentation doesn't put the extent data containing + * the location of the second page out of the first header page. + */ + if (toi_extent_state_next(max, current_stream)) { + /* Don't complain if readahead falls off the end */ + if (writing && section_barrier) { + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. " + "Expected compression ratio too optimistic?"); + if (test_action_state(TOI_LOGALL)) + dump_block_chains(); + } + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to " + "read/write. (Not necessarily a fatal error."); + return -ENOSPC; + } + + return 0; +} + +int devices_of_same_priority(struct toi_bdev_info *this) +{ + struct toi_bdev_info *check = prio_chain_head; + int i = 0; + + while (check) { + if (check->prio == this->prio) + i++; + check = check->next; + } + + return i; +} + +/** + * toi_bio_rw_page - do i/o on the next disk page in the image + * @writing: Whether reading or writing. + * @page: Page to do i/o on. + * @is_readahead: Whether we're doing readahead + * @free_group: The group used in allocating the page + * + * Submit a page for reading or writing, possibly readahead. + * Pass the group used in allocating the page as well, as it should + * be freed on completion of the bio if we're writing the page. + **/ +int toi_bio_rw_page(int writing, struct page *page, + int is_readahead, int free_group) +{ + int result = toi_end_of_stream(writing, 1); + struct toi_bdev_info *dev_info = toi_writer_posn.current_chain; + + if (result) { + if (writing) + abort_hibernate(TOI_INSUFFICIENT_STORAGE, + "Insufficient storage for your image."); + else + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to " + "read/write another page when stream has " + "ended."); + return -ENOSPC; + } + + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "%s %lx:%ld", + writing ? 
"Write" : "Read", + dev_info->dev_t, dev_info->blocks.current_offset); + + result = toi_do_io(writing, dev_info->bdev, + dev_info->blocks.current_offset << dev_info->bmap_shift, + page, is_readahead, 0, free_group); + + /* Ignore the result here - will check end of stream if come in again */ + go_next_page(writing, 1); + + if (result) + printk(KERN_ERR "toi_do_io returned %d.\n", result); + return result; +} + +dev_t get_header_dev_t(void) +{ + return prio_chain_head->dev_t; +} + +struct block_device *get_header_bdev(void) +{ + return prio_chain_head->bdev; +} + +unsigned long get_headerblock(void) +{ + return prio_chain_head->blocks.first->start << + prio_chain_head->bmap_shift; +} + +int get_main_pool_phys_params(void) +{ + struct toi_bdev_info *this = prio_chain_head; + int result; + + while (this) { + result = this->allocator->bio_allocator_ops->bmap(this); + if (result) + return result; + this = this->next; + } + + return 0; +} + +static int apply_header_reservation(void) +{ + int i; + + if (!header_pages_reserved) { + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "No header pages reserved at the moment."); + return 0; + } + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation."); + + /* Apply header space reservation */ + toi_extent_state_goto_start(); + + for (i = 0; i < header_pages_reserved; i++) + if (go_next_page(1, 0)) + return -ENOSPC; + + /* The end of header pages will be the start of pageset 2 */ + toi_extent_state_save(2); + + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "Finished applying header reservation."); + return 0; +} + +static int toi_bio_register_storage(void) +{ + int result = 0; + struct toi_module_ops *this_module; + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!this_module->enabled || + this_module->type != BIO_ALLOCATOR_MODULE) + continue; + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "Registering storage from %s.", + this_module->name); + result = this_module->bio_allocator_ops->register_storage(); + if (result) + break; + } + + return result; +} + +void toi_bio_free_unused_storage(void) +{ + struct toi_bdev_info *this = prio_chain_head; + + while (this) { + toi_bio_free_unused_storage_chain(this); + this = this->next; + } +} + +int toi_bio_allocate_storage(unsigned long request) +{ + struct toi_bdev_info *chain = prio_chain_head; + unsigned long to_get = request; + unsigned long extra_pages, needed; + int no_free = 0; + + if (!chain) { + int result = toi_bio_register_storage(); + toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " + "Registering storage."); + if (result) + return 0; + chain = prio_chain_head; + if (!chain) { + printk("TuxOnIce: No storage was registered.\n"); + return 0; + } + } + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " + "Request is %lu pages.", request); + extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long) + + sizeof(int)), PAGE_SIZE); + needed = request + extra_pages + header_pages_reserved; + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu " + "for header => %lu.", + extra_pages, header_pages_reserved, needed); + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.", + raw_pages_allocd); + + to_get = needed > raw_pages_allocd ? 
needed - raw_pages_allocd : 0; + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get); + + if (!to_get) + return apply_header_reservation(); + + while (to_get && chain) { + int num_group = devices_of_same_priority(chain); + int divisor = num_group - no_free; + int i; + unsigned long portion = DIV_ROUND_UP(to_get, divisor); + unsigned long got = 0; + unsigned long got_this_round = 0; + struct toi_bdev_info *top = chain; + + toi_message(TOI_BIO, TOI_VERBOSE, 0, + " Start of loop. To get is %lu. Divisor is %d.", + to_get, divisor); + no_free = 0; + + /* + * We're aiming to spread the allocated storage as evenly + * as possible, but we also want to get all the storage we + * can off this priority. + */ + for (i = 0; i < num_group; i++) { + struct toi_bio_allocator_ops *ops = + chain->allocator->bio_allocator_ops; + toi_message(TOI_BIO, TOI_VERBOSE, 0, + " Asking for %lu pages from chain %p.", + portion, chain); + got = ops->allocate_storage(chain, portion); + toi_message(TOI_BIO, TOI_VERBOSE, 0, + " Got %lu pages from allocator %p.", + got, chain); + if (!got) + no_free++; + got_this_round += got; + chain = chain->next; + } + toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a " + "total of %lu pages from %d allocators.", + got_this_round, divisor - no_free); + + raw_pages_allocd += got_this_round; + to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : + 0; + + /* + * If we got anything from chains of this priority and we + * still have storage to allocate, go over this priority + * again. + */ + if (got_this_round && to_get) + chain = top; + else + no_free = 0; + } + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling " + "get_main_pool_phys_params"); + /* Now let swap allocator bmap the pages */ + get_main_pool_phys_params(); + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. 
Reserving header."); + return apply_header_reservation(); +} + +void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd) +{ + int i = 0; + struct toi_bdev_info *cur_chain = prio_chain_head; + + while (cur_chain) { + cur_chain->pages_used = bkd->pages_used[i]; + cur_chain = cur_chain->next; + i++; + } +} + +int toi_bio_chains_debug_info(char *buffer, int size) +{ + /* Show what we actually used */ + struct toi_bdev_info *cur_chain = prio_chain_head; + int len = 0; + + while (cur_chain) { + len += scnprintf(buffer + len, size - len, " Used %lu pages " + "from %s.\n", cur_chain->pages_used, + cur_chain->name); + cur_chain = cur_chain->next; + } + + return len; +} + +void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr) +{ + struct toi_bdev_info *this = toi_writer_posn.current_chain, + *cmp = prio_chain_head; + + ptr->save.chain = 1; + while (this != cmp) { + ptr->save.chain++; + cmp = cmp->next; + } + ptr->save.block = this->blocks.current_offset; + + /* Save the raw info internally for quicker access when updating pointers */ + ptr->bdev = this->bdev; + ptr->block = this->blocks.current_offset << this->bmap_shift; +} + +void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr) +{ + int i = ptr->save.chain - 1; + struct toi_bdev_info *this; + struct hibernate_extent *hib; + + /* Find chain by stored index */ + this = prio_chain_head; + while (i) { + this = this->next; + i--; + } + toi_writer_posn.current_chain = this; + + /* Restore block */ + this->blocks.current_offset = ptr->save.block; + + /* Find current offset from block number */ + hib = this->blocks.first; + + while (hib->start > ptr->save.block) { + hib = hib->next; + } + + this->blocks.last_touched = this->blocks.current_extent = hib; +} diff -Nur linux-4.2/kernel/power/tuxonice_bio_core.c linux-4.2-pck/kernel/power/tuxonice_bio_core.c --- linux-4.2/kernel/power/tuxonice_bio_core.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_bio_core.c 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,1933 @@ +/* + * kernel/power/tuxonice_bio.c + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * Distributed under GPLv2. + * + * This file contains block io functions for TuxOnIce. These are + * used by the swapwriter and it is planned that they will also + * be used by the NFSwriter. 
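 *
 * [Illustration, not part of the patch.] The incremental-image pointer
 * helpers at the end of tuxonice_bio_chains.c above record a position as a
 * 1-based chain index plus a block number, then rebuild the pointer by
 * walking the priority list again at restore time. The index handling,
 * modelled on a plain singly linked list (types and names are the editor's):
 *
 *     struct node { struct node *next; };
 *
 *     static int save_index(struct node *head, struct node *cur)
 *     {
 *             int index = 1;                  // 1-based, list order
 *
 *             while (head != cur) {
 *                     head = head->next;
 *                     index++;
 *             }
 *             return index;
 *     }
 *
 *     static struct node *restore_index(struct node *head, int index)
 *     {
 *             while (--index)                 // index 1 means the head
 *                     head = head->next;
 *             return head;
 *     }
 *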
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include "tuxonice.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_modules.h" +#include "tuxonice_prepare_image.h" +#include "tuxonice_bio.h" +#include "tuxonice_ui.h" +#include "tuxonice_alloc.h" +#include "tuxonice_io.h" +#include "tuxonice_builtin.h" +#include "tuxonice_bio_internal.h" + +#define MEMORY_ONLY 1 +#define THROTTLE_WAIT 2 + +/* #define MEASURE_MUTEX_CONTENTION */ +#ifndef MEASURE_MUTEX_CONTENTION +#define my_mutex_lock(index, the_lock) mutex_lock(the_lock) +#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock) +#else +unsigned long mutex_times[2][2][NR_CPUS]; +#define my_mutex_lock(index, the_lock) do { \ + int have_mutex; \ + have_mutex = mutex_trylock(the_lock); \ + if (!have_mutex) { \ + mutex_lock(the_lock); \ + mutex_times[index][0][smp_processor_id()]++; \ + } else { \ + mutex_times[index][1][smp_processor_id()]++; \ + } + +#define my_mutex_unlock(index, the_lock) \ + mutex_unlock(the_lock); \ +} while (0) +#endif + +static int page_idx, reset_idx; + +static int target_outstanding_io = 1024; +static int max_outstanding_writes, max_outstanding_reads; + +static struct page *bio_queue_head, *bio_queue_tail; +static atomic_t toi_bio_queue_size; +static DEFINE_SPINLOCK(bio_queue_lock); + +static int free_mem_throttle, throughput_throttle; +int more_readahead = 1; +static struct page *readahead_list_head, *readahead_list_tail; + +static struct page *waiting_on; + +static atomic_t toi_io_in_progress, toi_io_done; +static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait); + +int current_stream; +/* Not static, so that the allocators can setup and complete + * writing the header */ +char *toi_writer_buffer; +int toi_writer_buffer_posn; + +static DEFINE_MUTEX(toi_bio_mutex); +static DEFINE_MUTEX(toi_bio_readahead_mutex); + +static struct task_struct *toi_queue_flusher; +static int toi_bio_queue_flush_pages(int dedicated_thread); + +struct toi_module_ops toi_blockwriter_ops; + +struct toi_incremental_image_pointer toi_inc_ptr[2][2]; + +#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \ + atomic_read(&toi_bio_queue_size)) + +unsigned long raw_pages_allocd, header_pages_reserved; + +static int toi_rw_buffer(int writing, char *buffer, int buffer_size, + int no_readahead); + +/** + * set_free_mem_throttle - set the point where we pause to avoid oom. + * + * Initially, this value is zero, but when we first fail to allocate memory, + * we set it (plus a buffer) and thereafter throttle i/o once that limit is + * reached. + **/ +static void set_free_mem_throttle(void) +{ + int new_throttle = nr_free_buffer_pages() + 256; + + if (new_throttle > free_mem_throttle) + free_mem_throttle = new_throttle; +} + +#define NUM_REASONS 7 +static atomic_t reasons[NUM_REASONS]; +static char *reason_name[NUM_REASONS] = { + "readahead not ready", + "bio allocation", + "synchronous I/O", + "toi_bio_get_new_page", + "memory low", + "readahead buffer allocation", + "throughput_throttle", +}; + +/* User Specified Parameters. */ +unsigned long resume_firstblock; +dev_t resume_dev_t; +struct block_device *resume_block_device; +static atomic_t resume_bdev_open_count; + +struct block_device *header_block_device; + +/** + * toi_open_bdev: Open a bdev at resume time. + * + * index: The swap index. May be MAX_SWAPFILES for the resume_dev_t + * (the user can have resume= pointing at a swap partition/file that isn't + * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the + * header. 
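 *
 * [Aside, not part of the patch.] The MEASURE_MUTEX_CONTENTION wrappers
 * defined earlier in this file boil down to per-CPU bookkeeping around a
 * trylock-first acquisition, roughly:
 *
 *     if (mutex_trylock(the_lock)) {
 *             mutex_times[index][1][smp_processor_id()]++;   // uncontended
 *     } else {
 *             mutex_lock(the_lock);                          // wait for it
 *             mutex_times[index][0][smp_processor_id()]++;   // contended
 *     }
 *
 * so the two counters record, per CPU, how often each mutex was taken
 * without sleeping versus how often the caller had to wait.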
It will be from a swap partition that was enabled when we hibernated, + * but we don't know it's real index until we read that first page. + * dev_t: The device major/minor. + * display_errs: Whether to try to do this quietly. + * + * We stored a dev_t in the image header. Open the matching device without + * requiring /dev/ in most cases and record the details needed + * to close it later and avoid duplicating work. + */ +struct block_device *toi_open_bdev(char *uuid, dev_t default_device, + int display_errs) +{ + struct block_device *bdev; + dev_t device = default_device; + char buf[32]; + int retried = 0; + +retry: + if (uuid) { + struct fs_info seek; + strncpy((char *) &seek.uuid, uuid, 16); + seek.dev_t = 0; + seek.last_mount_size = 0; + device = blk_lookup_fs_info(&seek); + if (!device) { + device = default_device; + printk(KERN_DEBUG "Unable to resolve uuid. Falling back" + " to dev_t.\n"); + } else + printk(KERN_DEBUG "Resolved uuid to device %s.\n", + format_dev_t(buf, device)); + } + + if (!device) { + printk(KERN_ERR "TuxOnIce attempting to open a " + "blank dev_t!\n"); + dump_stack(); + return NULL; + } + bdev = toi_open_by_devnum(device); + + if (IS_ERR(bdev) || !bdev) { + if (!retried) { + retried = 1; + wait_for_device_probe(); + goto retry; + } + if (display_errs) + toi_early_boot_message(1, TOI_CONTINUE_REQ, + "Failed to get access to block device " + "\"%x\" (error %d).\n Maybe you need " + "to run mknod and/or lvmsetup in an " + "initrd/ramfs?", device, bdev); + return ERR_PTR(-EINVAL); + } + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "TuxOnIce got bdev %p for dev_t %x.", + bdev, device); + + return bdev; +} + +static void toi_bio_reserve_header_space(unsigned long request) +{ + header_pages_reserved = request; +} + +/** + * do_bio_wait - wait for some TuxOnIce I/O to complete + * @reason: The array index of the reason we're waiting. + * + * Wait for a particular page of I/O if we're after a particular page. + * If we're not after a particular page, wait instead for all in flight + * I/O to be completed or for us to have enough free memory to be able + * to submit more I/O. + * + * If we wait, we also update our statistics regarding why we waited. + **/ +static void do_bio_wait(int reason) +{ + struct page *was_waiting_on = waiting_on; + + /* On SMP, waiting_on can be reset, so we make a copy */ + if (was_waiting_on) { + wait_on_page_locked(was_waiting_on); + atomic_inc(&reasons[reason]); + } else { + atomic_inc(&reasons[reason]); + + wait_event(num_in_progress_wait, + !atomic_read(&toi_io_in_progress) || + nr_free_buffer_pages() > free_mem_throttle); + } +} + +/** + * throttle_if_needed - wait for I/O completion if throttle points are reached + * @flags: What to check and how to act. + * + * Check whether we need to wait for some I/O to complete. We always check + * whether we have enough memory available, but may also (depending upon + * @reason) check if the throughput throttle limit has been reached. + **/ +static int throttle_if_needed(int flags) +{ + int free_pages = nr_free_buffer_pages(); + + /* Getting low on memory and I/O is in progress? 
*/ + while (unlikely(free_pages < free_mem_throttle) && + atomic_read(&toi_io_in_progress) && + !test_result_state(TOI_ABORTED)) { + if (!(flags & THROTTLE_WAIT)) + return -ENOMEM; + do_bio_wait(4); + free_pages = nr_free_buffer_pages(); + } + + while (!(flags & MEMORY_ONLY) && throughput_throttle && + TOTAL_OUTSTANDING_IO >= throughput_throttle && + !test_result_state(TOI_ABORTED)) { + int result = toi_bio_queue_flush_pages(0); + if (result) + return result; + atomic_inc(&reasons[6]); + wait_event(num_in_progress_wait, + !atomic_read(&toi_io_in_progress) || + TOTAL_OUTSTANDING_IO < throughput_throttle); + } + + return 0; +} + +/** + * update_throughput_throttle - update the raw throughput throttle + * @jif_index: The number of times this function has been called. + * + * This function is called four times per second by the core, and used to limit + * the amount of I/O we submit at once, spreading out our waiting through the + * whole job and letting userui get an opportunity to do its work. + * + * We don't start limiting I/O until 1/4s has gone so that we get a + * decent sample for our initial limit, and keep updating it because + * throughput may vary (on rotating media, eg) with our block number. + * + * We throttle to 1/10s worth of I/O. + **/ +static void update_throughput_throttle(int jif_index) +{ + int done = atomic_read(&toi_io_done); + throughput_throttle = done * 2 / 5 / jif_index; +} + +/** + * toi_finish_all_io - wait for all outstanding i/o to complete + * + * Flush any queued but unsubmitted I/O and wait for it all to complete. + **/ +static int toi_finish_all_io(void) +{ + int result = toi_bio_queue_flush_pages(0); + toi_bio_queue_flusher_should_finish = 1; + wake_up(&toi_io_queue_flusher); + wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO); + return result; +} + +/** + * toi_end_bio - bio completion function. + * @bio: bio that has completed. + * @err: Error value. Yes, like end_swap_bio_read, we ignore it. + * + * Function called by the block driver from interrupt context when I/O is + * completed. If we were writing the page, we want to free it and will have + * set bio->bi_private to the parameter we should use in telling the page + * allocation accounting code what the page was allocated for. If we're + * reading the page, it will be in the singly linked list made from + * page->private pointers. + **/ +static void toi_end_bio(struct bio *bio, int err) +{ + struct page *page = bio->bi_io_vec[0].bv_page; + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + + unlock_page(page); + bio_put(bio); + + if (waiting_on == page) + waiting_on = NULL; + + put_page(page); + + if (bio->bi_private) + toi__free_page((int) ((unsigned long) bio->bi_private) , page); + + bio_put(bio); + + atomic_dec(&toi_io_in_progress); + atomic_inc(&toi_io_done); + + wake_up(&num_in_progress_wait); +} + +/** + * submit - submit BIO request + * @writing: READ or WRITE. + * @dev: The block device we're using. + * @first_block: The first sector we're using. + * @page: The page being used for I/O. + * @free_group: If writing, the group that was used in allocating the page + * and which will be used in freeing the page from the completion + * routine. + * + * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the + * textbook - allocate and initialize the bio. If we're writing, make sure + * the page is marked as dirty. Then submit it and carry on." + * + * If we're just testing the speed of our own code, we fake having done all + * the hard work and all toi_end_bio immediately. 
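 *
 * [Aside, not part of the patch.] A worked example of the arithmetic in
 * update_throughput_throttle() earlier in this file: it runs four times a
 * second, so @jif_index counts quarter seconds. After two seconds
 * (jif_index == 8) with 8000 pages of I/O completed:
 *
 *     throughput_throttle = 8000 * 2 / 5 / 8;    // = 400 pages
 *
 *     // measured rate : 8000 pages / 2 s  = 4000 pages/s
 *     // 1/10 s worth  : 4000 / 10         =  400 pages, as intended
 *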
+ **/ +static int submit(int writing, struct block_device *dev, sector_t first_block, + struct page *page, int free_group) +{ + struct bio *bio = NULL; + int cur_outstanding_io, result; + + /* + * Shouldn't throttle if reading - can deadlock in the single + * threaded case as pages are only freed when we use the + * readahead. + */ + if (writing) { + result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT); + if (result) + return result; + } + + while (!bio) { + bio = bio_alloc(TOI_ATOMIC_GFP, 1); + if (!bio) { + set_free_mem_throttle(); + do_bio_wait(1); + } + } + + bio->bi_bdev = dev; + bio->bi_iter.bi_sector = first_block; + bio->bi_private = (void *) ((unsigned long) free_group); + bio->bi_end_io = toi_end_bio; + bio->bi_flags |= (1 << BIO_TOI); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n", + (unsigned long long) first_block); + bio_put(bio); + return -EFAULT; + } + + bio_get(bio); + + cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress); + if (writing) { + if (cur_outstanding_io > max_outstanding_writes) + max_outstanding_writes = cur_outstanding_io; + } else { + if (cur_outstanding_io > max_outstanding_reads) + max_outstanding_reads = cur_outstanding_io; + } + + /* Still read the header! */ + if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) { + /* Fake having done the hard work */ + set_bit(BIO_UPTODATE, &bio->bi_flags); + toi_end_bio(bio, 0); + } else + submit_bio(writing | REQ_SYNC, bio); + + return 0; +} + +/** + * toi_do_io: Prepare to do some i/o on a page and submit or batch it. + * + * @writing: Whether reading or writing. + * @bdev: The block device which we're using. + * @block0: The first sector we're reading or writing. + * @page: The page on which I/O is being done. + * @readahead_index: If doing readahead, the index (reset this flag when done). + * @syncio: Whether the i/o is being done synchronously. + * + * Prepare and start a read or write operation. + * + * Note that we always work with our own page. If writing, we might be given a + * compression buffer that will immediately be used to start compressing the + * next page. For reading, we do readahead and therefore don't know the final + * address where the data needs to go. + **/ +int toi_do_io(int writing, struct block_device *bdev, long block0, + struct page *page, int is_readahead, int syncio, int free_group) +{ + page->private = 0; + + /* Do here so we don't race against toi_bio_get_next_page_read */ + lock_page(page); + + if (is_readahead) { + if (readahead_list_head) + readahead_list_tail->private = (unsigned long) page; + else + readahead_list_head = page; + + readahead_list_tail = page; + } + + /* Done before submitting to avoid races. */ + if (syncio) + waiting_on = page; + + /* Submit the page */ + get_page(page); + + if (submit(writing, bdev, block0, page, free_group)) + return -EFAULT; + + if (syncio) + do_bio_wait(2); + + return 0; +} + +/** + * toi_bdev_page_io - simpler interface to do directly i/o on a single page + * @writing: Whether reading or writing. + * @bdev: Block device on which we're operating. + * @pos: Sector at which page to read or write starts. + * @page: Page to be read/written. + * + * A simple interface to submit a page of I/O and wait for its completion. + * The caller must free the page used. 
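 *
 * [Illustration, not part of the patch.] toi_do_io() above threads readahead
 * pages into a FIFO by reusing page->private as a next pointer, so no list
 * nodes need allocating while memory is tight. The same idiom on a
 * hypothetical userspace type:
 *
 *     struct buf { unsigned long private; };      // doubles as "next"
 *
 *     static struct buf *fifo_head, *fifo_tail;
 *
 *     static void fifo_push(struct buf *b)
 *     {
 *             b->private = 0;
 *             if (fifo_head)
 *                     fifo_tail->private = (unsigned long) b;
 *             else
 *                     fifo_head = b;
 *             fifo_tail = b;
 *     }
 *
 *     static struct buf *fifo_pop(void)
 *     {
 *             struct buf *b = fifo_head;
 *
 *             if (b) {
 *                     fifo_head = (struct buf *) b->private;
 *                     if (!fifo_head)
 *                             fifo_tail = NULL;
 *             }
 *             return b;
 *     }
 *
 * The bio write queue (toi_bio_queue_write() below) uses the same trick,
 * just with a spinlock around the list updates.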
+ **/ +static int toi_bdev_page_io(int writing, struct block_device *bdev, + long pos, struct page *page) +{ + return toi_do_io(writing, bdev, pos, page, 0, 1, 0); +} + +/** + * toi_bio_memory_needed - report the amount of memory needed for block i/o + * + * We want to have at least enough memory so as to have target_outstanding_io + * or more transactions on the fly at once. If we can do more, fine. + **/ +static int toi_bio_memory_needed(void) +{ + return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) + + sizeof(struct bio)); +} + +/** + * toi_bio_print_debug_stats - put out debugging info in the buffer provided + * @buffer: A buffer of size @size into which text should be placed. + * @size: The size of @buffer. + * + * Fill a buffer with debugging info. This is used for both our debug_info sysfs + * entry and for recording the same info in dmesg. + **/ +static int toi_bio_print_debug_stats(char *buffer, int size) +{ + int len = 0; + + if (toiActiveAllocator != &toi_blockwriter_ops) { + len = scnprintf(buffer, size, + "- Block I/O inactive.\n"); + return len; + } + + len = scnprintf(buffer, size, "- Block I/O active.\n"); + + len += toi_bio_chains_debug_info(buffer + len, size - len); + + len += scnprintf(buffer + len, size - len, + "- Max outstanding reads %d. Max writes %d.\n", + max_outstanding_reads, max_outstanding_writes); + + len += scnprintf(buffer + len, size - len, + " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n", + target_outstanding_io, + PAGE_SIZE, (unsigned int) sizeof(struct request), + (unsigned int) sizeof(struct bio), toi_bio_memory_needed()); + +#ifdef MEASURE_MUTEX_CONTENTION + { + int i; + + len += scnprintf(buffer + len, size - len, + " Mutex contention while reading:\n Contended Free\n"); + + for_each_online_cpu(i) + len += scnprintf(buffer + len, size - len, + " %9lu %9lu\n", + mutex_times[0][0][i], mutex_times[0][1][i]); + + len += scnprintf(buffer + len, size - len, + " Mutex contention while writing:\n Contended Free\n"); + + for_each_online_cpu(i) + len += scnprintf(buffer + len, size - len, + " %9lu %9lu\n", + mutex_times[1][0][i], mutex_times[1][1][i]); + + } +#endif + + return len + scnprintf(buffer + len, size - len, + " Free mem throttle point reached %d.\n", free_mem_throttle); +} + +static int total_header_bytes; +static int unowned; + +void debug_broken_header(void) +{ + printk(KERN_DEBUG "Image header too big for size allocated!\n"); + print_toi_header_storage_for_modules(); + printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed()); + printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header)); + printk(KERN_DEBUG "Total unowned : %d.\n", unowned); + printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes, + DIV_ROUND_UP(total_header_bytes, PAGE_SIZE)); + printk(KERN_DEBUG "Space needed now : %ld.\n", + get_header_storage_needed()); + dump_block_chains(); + abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small."); +} + +static int toi_bio_update_previous_inc_img_ptr(int stream) +{ + int result; + char * buffer = (char *) toi_get_zeroed_page(12, TOI_ATOMIC_GFP); + struct page *page; + struct toi_incremental_image_pointer *prev, *this; + + prev = &toi_inc_ptr[stream][0]; + this = &toi_inc_ptr[stream][1]; + + if (!buffer) { + // We're at the start of writing a pageset. Memory should not be that scarce. 
+ return -ENOMEM; + } + + page = virt_to_page(buffer); + result = toi_do_io(READ, prev->bdev, prev->block, page, 0, 1, 0); + + if (result) + goto out; + + memcpy(buffer, (char *) this, sizeof(this->save)); + + result = toi_do_io(WRITE, prev->bdev, prev->block, page, 0, 0, 12); + + // If the IO is successfully submitted (!result), the page will be freed + // asynchronously on completion. +out: + if (result) + toi__free_page(12, virt_to_page(buffer)); + return result; +} + +/** + * toi_rw_init_incremental - incremental image part of setting up to write new section + */ +static int toi_write_init_incremental(int stream) +{ + int result = 0; + + // Remember the location of this block so we can link to it. + toi_bio_store_inc_image_ptr(&toi_inc_ptr[stream][1]); + + // Update the pointer at the start of the last pageset with the same stream number. + result = toi_bio_update_previous_inc_img_ptr(stream); + if (result) + return result; + + // Move the current to the previous slot. + memcpy(&toi_inc_ptr[stream][0], &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1])); + + // Store a blank pointer at the start of this incremental pageset + memset(&toi_inc_ptr[stream][1], 0, sizeof(toi_inc_ptr[stream][1])); + result = toi_rw_buffer(WRITE, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); + if (result) + return result; + + // Serialise extent chains if this is an incremental pageset + return toi_serialise_extent_chains(); +} + +/** + * toi_read_init_incremental - incremental image part of setting up to read new section + */ +static int toi_read_init_incremental(int stream) +{ + int result; + + // Set our position to the start of the next pageset + toi_bio_restore_inc_image_ptr(&toi_inc_ptr[stream][1]); + + // Read the start of the next incremental pageset (if any) + result = toi_rw_buffer(READ, (char *) &toi_inc_ptr[stream][1], sizeof(toi_inc_ptr[stream][1]), 0); + + if (!result) + result = toi_load_extent_chains(); + + return result; +} + +/** + * toi_rw_init - prepare to read or write a stream in the image + * @writing: Whether reading or writing. + * @stream number: Section of the image being processed. + * + * Prepare to read or write a section ('stream') in the image. + **/ +static int toi_rw_init(int writing, int stream_number) +{ + if (stream_number) + toi_extent_state_restore(stream_number); + else + toi_extent_state_goto_start(); + + if (writing) { + reset_idx = 0; + if (!current_stream) + page_idx = 0; + } else { + reset_idx = 1; + } + + atomic_set(&toi_io_done, 0); + if (!toi_writer_buffer) + toi_writer_buffer = (char *) toi_get_zeroed_page(11, + TOI_ATOMIC_GFP); + toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE; + + current_stream = stream_number; + + more_readahead = 1; + + if (test_result_state(TOI_KEPT_IMAGE)) { + int result; + + if (writing) { + result = toi_write_init_incremental(stream_number); + } else { + result = toi_read_init_incremental(stream_number); + } + + if (result) + return result; + } + + return toi_writer_buffer ? 0 : -ENOMEM; +} + +/** + * toi_bio_queue_write - queue a page for writing + * @full_buffer: Pointer to a page to be queued + * + * Add a page to the queue to be submitted. If we're the queue flusher, + * we'll do this once we've dropped toi_bio_mutex, so other threads can + * continue to submit I/O while we're on the slow path doing the actual + * submission. 
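 *
 * [Illustration, not part of the patch.] The incremental-image setup above
 * (toi_write_init_incremental() and toi_bio_update_previous_inc_img_ptr())
 * maintains an on-disk linked list of pagesets: each pageset begins with a
 * pointer record that is written blank and patched once the location of the
 * next pageset is known. A simplified sketch of that append step over
 * hypothetical read_rec()/write_rec() helpers:
 *
 *     struct ptr_rec { unsigned long next_block; };
 *
 *     int read_rec(unsigned long block, struct ptr_rec *rec);        // hypothetical
 *     int write_rec(unsigned long block, const struct ptr_rec *rec); // hypothetical
 *
 *     static int append_pageset(unsigned long prev_block,
 *                               unsigned long new_block)
 *     {
 *             struct ptr_rec rec;
 *             int err;
 *
 *             err = read_rec(prev_block, &rec);      // fetch old header
 *             if (err)
 *                     return err;
 *             rec.next_block = new_block;            // link it to us
 *             err = write_rec(prev_block, &rec);
 *             if (err)
 *                     return err;
 *
 *             rec.next_block = 0;                    // our own blank link
 *             return write_rec(new_block, &rec);
 *     }
 *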
+ **/ +static void toi_bio_queue_write(char **full_buffer) +{ + struct page *page = virt_to_page(*full_buffer); + unsigned long flags; + + *full_buffer = NULL; + page->private = 0; + + spin_lock_irqsave(&bio_queue_lock, flags); + if (!bio_queue_head) + bio_queue_head = page; + else + bio_queue_tail->private = (unsigned long) page; + + bio_queue_tail = page; + atomic_inc(&toi_bio_queue_size); + + spin_unlock_irqrestore(&bio_queue_lock, flags); + wake_up(&toi_io_queue_flusher); +} + +/** + * toi_rw_cleanup - Cleanup after i/o. + * @writing: Whether we were reading or writing. + * + * Flush all I/O and clean everything up after reading or writing a + * section of the image. + **/ +static int toi_rw_cleanup(int writing) +{ + int i, result = 0; + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup."); + if (writing) { + if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED)) + toi_bio_queue_write(&toi_writer_buffer); + + while (bio_queue_head && !result) + result = toi_bio_queue_flush_pages(0); + + if (result) + return result; + + if (current_stream == 2) + toi_extent_state_save(1); + else if (current_stream == 1) + toi_extent_state_save(3); + } + + result = toi_finish_all_io(); + + while (readahead_list_head) { + void *next = (void *) readahead_list_head->private; + toi__free_page(12, readahead_list_head); + readahead_list_head = next; + } + + readahead_list_tail = NULL; + + if (!current_stream) + return result; + + for (i = 0; i < NUM_REASONS; i++) { + if (!atomic_read(&reasons[i])) + continue; + printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n", + reason_name[i], atomic_read(&reasons[i])); + atomic_set(&reasons[i], 0); + } + + current_stream = 0; + return result; +} + +/** + * toi_start_one_readahead - start one page of readahead + * @dedicated_thread: Is this a thread dedicated to doing readahead? + * + * Start one new page of readahead. If this is being called by a thread + * whose only just is to submit readahead, don't quit because we failed + * to allocate a page. + **/ +static int toi_start_one_readahead(int dedicated_thread) +{ + char *buffer = NULL; + int oom = 0, result; + + result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0); + if (result) { + printk("toi_start_one_readahead: throttle_if_needed returned %d.\n", result); + return result; + } + + mutex_lock(&toi_bio_readahead_mutex); + + while (!buffer) { + buffer = (char *) toi_get_zeroed_page(12, + TOI_ATOMIC_GFP); + if (!buffer) { + if (oom && !dedicated_thread) { + mutex_unlock(&toi_bio_readahead_mutex); + printk("toi_start_one_readahead: oom and !dedicated thread %d.\n", result); + return -ENOMEM; + } + + oom = 1; + set_free_mem_throttle(); + do_bio_wait(5); + } + } + + result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0); + if (result) { + printk("toi_start_one_readahead: toi_bio_rw_page returned %d.\n", result); + } + if (result == -ENOSPC) + toi__free_page(12, virt_to_page(buffer)); + mutex_unlock(&toi_bio_readahead_mutex); + if (result) { + if (result == -ENOSPC) + toi_message(TOI_BIO, TOI_VERBOSE, 0, + "Last readahead page submitted."); + else + printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n", + result); + } + return result; +} + +/** + * toi_start_new_readahead - start new readahead + * @dedicated_thread: Are we dedicated to this task? + * + * Start readahead of image pages. + * + * We can be called as a thread dedicated to this task (may be helpful on + * systems with lots of CPUs), in which case we don't exit until there's no + * more readahead. 
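 *
 * [Illustration, not part of the patch.] toi_start_one_readahead() above
 * allocates its buffer with a back-off loop: on the first failure it raises
 * the low-memory threshold and waits for some I/O to finish, and only a
 * dedicated readahead thread keeps retrying beyond that. Distilled into a
 * userspace-style sketch (all helper names are the editor's):
 *
 *     void *try_alloc_page(void);
 *     void raise_free_mem_throttle(void);     // cf. set_free_mem_throttle()
 *     void wait_for_io_completion(void);      // cf. do_bio_wait(5)
 *
 *     static void *alloc_with_backoff(int dedicated_thread)
 *     {
 *             void *buf;
 *             int failed_before = 0;
 *
 *             while (!(buf = try_alloc_page())) {
 *                     if (failed_before && !dedicated_thread)
 *                             return NULL;    // caller copes with -ENOMEM
 *                     failed_before = 1;
 *                     raise_free_mem_throttle();
 *                     wait_for_io_completion();
 *             }
 *             return buf;
 *     }
 *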
+ * + * If this is not called by a dedicated thread, we top up our queue until + * there's no more readahead to submit, we've submitted the number given + * in target_outstanding_io or the number in progress exceeds the target + * outstanding I/O value. + * + * No mutex needed because this is only ever called by the first cpu. + **/ +static int toi_start_new_readahead(int dedicated_thread) +{ + int last_result, num_submitted = 0; + + /* Start a new readahead? */ + if (!more_readahead) + return 0; + + do { + last_result = toi_start_one_readahead(dedicated_thread); + + if (last_result) { + if (last_result == -ENOMEM || last_result == -ENOSPC) + return 0; + + printk(KERN_DEBUG + "Begin read chunk returned %d.\n", + last_result); + } else + num_submitted++; + + } while (more_readahead && !last_result && + (dedicated_thread || + (num_submitted < target_outstanding_io && + atomic_read(&toi_io_in_progress) < target_outstanding_io))); + + return last_result; +} + +/** + * bio_io_flusher - start the dedicated I/O flushing routine + * @writing: Whether we're writing the image. + **/ +static int bio_io_flusher(int writing) +{ + + if (writing) + return toi_bio_queue_flush_pages(1); + else + return toi_start_new_readahead(1); +} + +/** + * toi_bio_get_next_page_read - read a disk page, perhaps with readahead + * @no_readahead: Whether we can use readahead + * + * Read a page from disk, submitting readahead and cleaning up finished i/o + * while we wait for the page we're after. + **/ +static int toi_bio_get_next_page_read(int no_readahead) +{ + char *virt; + struct page *old_readahead_list_head; + + /* + * When reading the second page of the header, we have to + * delay submitting the read until after we've gotten the + * extents out of the first page. + */ + if (unlikely(no_readahead)) { + int result = toi_start_one_readahead(0); + if (result) { + printk(KERN_EMERG "No readahead and toi_start_one_readahead " + "returned non-zero.\n"); + return -EIO; + } + } + + if (unlikely(!readahead_list_head)) { + /* + * If the last page finishes exactly on the page + * boundary, we will be called one extra time and + * have no data to return. In this case, we should + * not BUG(), like we used to! + */ + if (!more_readahead) { + printk(KERN_EMERG "No more readahead.\n"); + return -ENOSPC; + } + if (unlikely(toi_start_one_readahead(0))) { + printk(KERN_EMERG "No readahead and " + "toi_start_one_readahead returned non-zero.\n"); + return -EIO; + } + } + + if (PageLocked(readahead_list_head)) { + waiting_on = readahead_list_head; + do_bio_wait(0); + } + + virt = page_address(readahead_list_head); + memcpy(toi_writer_buffer, virt, PAGE_SIZE); + + mutex_lock(&toi_bio_readahead_mutex); + old_readahead_list_head = readahead_list_head; + readahead_list_head = (struct page *) readahead_list_head->private; + mutex_unlock(&toi_bio_readahead_mutex); + toi__free_page(12, old_readahead_list_head); + return 0; +} + +/** + * toi_bio_queue_flush_pages - flush the queue of pages queued for writing + * @dedicated_thread: Whether we're a dedicated thread + * + * Flush the queue of pages ready to be written to disk. + * + * If we're a dedicated thread, stay in here until told to leave, + * sleeping in wait_event. + * + * The first thread is normally the only one to come in here. Another + * thread can enter this routine too, though, via throttle_if_needed. + * Since that's the case, we must be careful to only have one thread + * doing this work at a time. Otherwise we have a race and could save + * pages out of order. 
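 *
 * [Aside, not part of the patch.] The "one flusher at a time" rule described
 * here is enforced below with a function-local mutex taken by
 * mutex_trylock(); a second caller simply returns 0 and leaves the queue to
 * whoever already holds it. The shape of that guard:
 *
 *     static int flush_queue(void)
 *     {
 *             static DEFINE_MUTEX(busy);
 *             int ret = 0;
 *
 *             if (!mutex_trylock(&busy))
 *                     return 0;               // someone else is flushing
 *
 *             while (queue_not_empty())       // hypothetical predicate
 *                     ret = submit_one();     // hypothetical worker
 *
 *             mutex_unlock(&busy);
 *             return ret;
 *     }
 *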
+ * + * If an error occurs, free all remaining pages without submitting them + * for I/O. + **/ + +int toi_bio_queue_flush_pages(int dedicated_thread) +{ + unsigned long flags; + int result = 0; + static DEFINE_MUTEX(busy); + + if (!mutex_trylock(&busy)) + return 0; + +top: + spin_lock_irqsave(&bio_queue_lock, flags); + while (bio_queue_head) { + struct page *page = bio_queue_head; + bio_queue_head = (struct page *) page->private; + if (bio_queue_tail == page) + bio_queue_tail = NULL; + atomic_dec(&toi_bio_queue_size); + spin_unlock_irqrestore(&bio_queue_lock, flags); + + /* Don't generate more error messages if already had one */ + if (!result) + result = toi_bio_rw_page(WRITE, page, 0, 11); + /* + * If writing the page failed, don't drop out. + * Flush the rest of the queue too. + */ + if (result) + toi__free_page(11 , page); + spin_lock_irqsave(&bio_queue_lock, flags); + } + spin_unlock_irqrestore(&bio_queue_lock, flags); + + if (dedicated_thread) { + wait_event(toi_io_queue_flusher, bio_queue_head || + toi_bio_queue_flusher_should_finish); + if (likely(!toi_bio_queue_flusher_should_finish)) + goto top; + toi_bio_queue_flusher_should_finish = 0; + } + + mutex_unlock(&busy); + return result; +} + +/** + * toi_bio_get_new_page - get a new page for I/O + * @full_buffer: Pointer to a page to allocate. + **/ +static int toi_bio_get_new_page(char **full_buffer) +{ + int result = throttle_if_needed(THROTTLE_WAIT); + if (result) + return result; + + while (!*full_buffer) { + *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP); + if (!*full_buffer) { + set_free_mem_throttle(); + do_bio_wait(3); + } + } + + return 0; +} + +/** + * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O + * @writing: Bool - whether writing (or reading). + * @buffer: The start of the buffer to write or fill. + * @buffer_size: The size of the buffer to write or fill. + * @no_readahead: Don't try to start readhead (when getting extents). + **/ +static int toi_rw_buffer(int writing, char *buffer, int buffer_size, + int no_readahead) +{ + int bytes_left = buffer_size, result = 0; + + while (bytes_left) { + char *source_start = buffer + buffer_size - bytes_left; + char *dest_start = toi_writer_buffer + toi_writer_buffer_posn; + int capacity = PAGE_SIZE - toi_writer_buffer_posn; + char *to = writing ? dest_start : source_start; + char *from = writing ? source_start : dest_start; + + if (bytes_left <= capacity) { + memcpy(to, from, bytes_left); + toi_writer_buffer_posn += bytes_left; + return 0; + } + + /* Complete this page and start a new one */ + memcpy(to, from, capacity); + bytes_left -= capacity; + + if (!writing) { + /* + * Perform actual I/O: + * read readahead_list_head into toi_writer_buffer + */ + int result = toi_bio_get_next_page_read(no_readahead); + if (result && bytes_left) { + printk("toi_bio_get_next_page_read " + "returned %d. Expecting to read %d bytes.\n", result, bytes_left); + return result; + } + } else { + toi_bio_queue_write(&toi_writer_buffer); + result = toi_bio_get_new_page(&toi_writer_buffer); + if (result) { + printk(KERN_ERR "toi_bio_get_new_page returned " + "%d.\n", result); + return result; + } + } + + toi_writer_buffer_posn = 0; + toi_cond_pause(0, NULL); + } + + return 0; +} + +/** + * toi_bio_read_page - read a page of the image + * @pfn: The pfn where the data belongs. + * @buffer_page: The page containing the (possibly compressed) data. + * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE). 
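+ *
+ * Each record in the image stream is laid out as
+ * [page index|destination pfn|buffer size|page data], which is what the
+ * sequence of toi_rw_buffer() calls below unpacks.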
+ * + * Read a (possibly compressed) page from the image, into buffer_page, + * returning its pfn and the buffer size. + **/ +static int toi_bio_read_page(unsigned long *pfn, int buf_type, + void *buffer_page, unsigned int *buf_size) +{ + int result = 0; + int this_idx; + char *buffer_virt = TOI_MAP(buf_type, buffer_page); + + /* + * Only call start_new_readahead if we don't have a dedicated thread + * and we're the queue flusher. + */ + if (current == toi_queue_flusher && more_readahead && + !test_action_state(TOI_NO_READAHEAD)) { + int result2 = toi_start_new_readahead(0); + if (result2) { + printk(KERN_DEBUG "Queue flusher and " + "toi_start_one_readahead returned non-zero.\n"); + result = -EIO; + goto out; + } + } + + my_mutex_lock(0, &toi_bio_mutex); + + /* + * Structure in the image: + * [destination pfn|page size|page data] + * buf_size is PAGE_SIZE + * We can validly find there's nothing to read in a multithreaded + * situation. + */ + if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) || + toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) || + toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) || + toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) { + result = -ENODATA; + goto out_unlock; + } + + if (reset_idx) { + page_idx = this_idx; + reset_idx = 0; + } else { + page_idx++; + if (!this_idx) + result = -ENODATA; + else if (page_idx != this_idx) + printk(KERN_ERR "Got page index %d, expected %d.\n", + this_idx, page_idx); + } + +out_unlock: + my_mutex_unlock(0, &toi_bio_mutex); +out: + TOI_UNMAP(buf_type, buffer_page); + return result; +} + +/** + * toi_bio_write_page - write a page of the image + * @pfn: The pfn where the data belongs. + * @buffer_page: The page containing the (possibly compressed) data. + * @buf_size: The number of bytes on @buffer_page used. + * + * Write a (possibly compressed) page to the image from the buffer, together + * with it's index and buffer size. + **/ +static int toi_bio_write_page(unsigned long pfn, int buf_type, + void *buffer_page, unsigned int buf_size) +{ + char *buffer_virt; + int result = 0, result2 = 0; + + if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) + return 0; + + my_mutex_lock(1, &toi_bio_mutex); + + if (test_result_state(TOI_ABORTED)) { + my_mutex_unlock(1, &toi_bio_mutex); + return 0; + } + + buffer_virt = TOI_MAP(buf_type, buffer_page); + page_idx++; + + /* + * Structure in the image: + * [destination pfn|page size|page data] + * buf_size is PAGE_SIZE + */ + if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) || + toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) || + toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) || + toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) { + printk(KERN_DEBUG "toi_rw_buffer returned non-zero to " + "toi_bio_write_page.\n"); + result = -EIO; + } + + TOI_UNMAP(buf_type, buffer_page); + my_mutex_unlock(1, &toi_bio_mutex); + + if (current == toi_queue_flusher) + result2 = toi_bio_queue_flush_pages(0); + + return result ? result : result2; +} + +/** + * _toi_rw_header_chunk - read or write a portion of the image header + * @writing: Whether reading or writing. + * @owner: The module for which we're writing. + * Used for confirming that modules + * don't use more header space than they asked for. + * @buffer: Address of the data to write. + * @buffer_size: Size of the data buffer. + * @no_readahead: Don't try to start readhead (when getting extents). + * + * Perform PAGE_SIZE I/O. Start readahead if needed. 
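+ *
+ * When @owner is supplied, @buffer_size is charged against
+ * owner->header_requested; exceeding that budget while writing is
+ * reported and the chunk is not written.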
+ **/ +static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner, + char *buffer, int buffer_size, int no_readahead) +{ + int result = 0; + + if (owner) { + owner->header_used += buffer_size; + toi_message(TOI_HEADER, TOI_LOW, 1, + "Header: %s : %d bytes (%d/%d) from offset %d.", + owner->name, + buffer_size, owner->header_used, + owner->header_requested, + toi_writer_buffer_posn); + if (owner->header_used > owner->header_requested && writing) { + printk(KERN_EMERG "TuxOnIce module %s is using more " + "header space (%u) than it requested (%u).\n", + owner->name, + owner->header_used, + owner->header_requested); + return buffer_size; + } + } else { + unowned += buffer_size; + toi_message(TOI_HEADER, TOI_LOW, 1, + "Header: (No owner): %d bytes (%d total so far) from " + "offset %d.", buffer_size, unowned, + toi_writer_buffer_posn); + } + + if (!writing && !no_readahead && more_readahead) { + result = toi_start_new_readahead(0); + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead " + "returned %d.", result); + } + + if (!result) { + result = toi_rw_buffer(writing, buffer, buffer_size, + no_readahead); + toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned " + "%d.", result); + } + + total_header_bytes += buffer_size; + toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning " + "%d.", result); + return result; +} + +static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner, + char *buffer, int size) +{ + return _toi_rw_header_chunk(writing, owner, buffer, size, 1); +} + +static int toi_rw_header_chunk_noreadahead(int writing, + struct toi_module_ops *owner, char *buffer, int size) +{ + return _toi_rw_header_chunk(writing, owner, buffer, size, 1); +} + +/** + * toi_bio_storage_needed - get the amount of storage needed for my fns + **/ +static int toi_bio_storage_needed(void) +{ + return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed(); +} + +/** + * toi_bio_save_config_info - save block I/O config to image header + * @buf: PAGE_SIZE'd buffer into which data should be saved. + **/ +static int toi_bio_save_config_info(char *buf) +{ + int *ints = (int *) buf; + ints[0] = target_outstanding_io; + return sizeof(int); +} + +/** + * toi_bio_load_config_info - restore block I/O config + * @buf: Data to be reloaded. + * @size: Size of the buffer saved. + **/ +static void toi_bio_load_config_info(char *buf, int size) +{ + int *ints = (int *) buf; + target_outstanding_io = ints[0]; +} + +void close_resume_dev_t(int force) +{ + if (!resume_block_device) + return; + + if (force) + atomic_set(&resume_bdev_open_count, 0); + else + atomic_dec(&resume_bdev_open_count); + + if (!atomic_read(&resume_bdev_open_count)) { + toi_close_bdev(resume_block_device); + resume_block_device = NULL; + } +} + +int open_resume_dev_t(int force, int quiet) +{ + if (force) { + close_resume_dev_t(1); + atomic_set(&resume_bdev_open_count, 1); + } else + atomic_inc(&resume_bdev_open_count); + + if (resume_block_device) + return 0; + + resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0); + if (IS_ERR(resume_block_device)) { + if (!quiet) + toi_early_boot_message(1, TOI_CONTINUE_REQ, + "Failed to open device %x, where" + " the header should be found.", + resume_dev_t); + resume_block_device = NULL; + atomic_set(&resume_bdev_open_count, 0); + return 1; + } + + return 0; +} + +/** + * toi_bio_initialise - initialise bio code at start of some action + * @starting_cycle: Whether starting a hibernation cycle, or just reading or + * writing a sysfs value. 
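+ *
+ * Does nothing unless we are starting a hibernation cycle and a resume
+ * device has been set; in that case the resume device is opened and the
+ * signature page is read.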
+ **/ +static int toi_bio_initialise(int starting_cycle) +{ + int result; + + if (!starting_cycle || !resume_dev_t) + return 0; + + max_outstanding_writes = 0; + max_outstanding_reads = 0; + current_stream = 0; + toi_queue_flusher = current; +#ifdef MEASURE_MUTEX_CONTENTION + { + int i, j, k; + + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for_each_online_cpu(k) + mutex_times[i][j][k] = 0; + } +#endif + result = open_resume_dev_t(0, 1); + + if (result) + return result; + + return get_signature_page(); +} + +static unsigned long raw_to_real(unsigned long raw) +{ + unsigned long extra; + + extra = (raw * (sizeof(unsigned long) + sizeof(int)) + + (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) / + (PAGE_SIZE + sizeof(unsigned long) + sizeof(int)); + + return raw > extra ? raw - extra : 0; +} + +static unsigned long toi_bio_storage_available(void) +{ + unsigned long sum = 0; + struct toi_module_ops *this_module; + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!this_module->enabled || + this_module->type != BIO_ALLOCATOR_MODULE) + continue; + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage " + "available from %s.", this_module->name); + sum += this_module->bio_allocator_ops->storage_available(); + } + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu " + "pages (%d header pages).", sum, header_pages_reserved); + + return sum > header_pages_reserved ? + raw_to_real(sum - header_pages_reserved) : 0; + +} + +static unsigned long toi_bio_storage_allocated(void) +{ + return raw_pages_allocd > header_pages_reserved ? + raw_to_real(raw_pages_allocd - header_pages_reserved) : 0; +} + +/* + * If we have read part of the image, we might have filled memory with + * data that should be zeroed out. + */ +static void toi_bio_noresume_reset(void) +{ + toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset."); + toi_rw_cleanup(READ); + free_all_bdev_info(); +} + +/** + * toi_bio_cleanup - cleanup after some action + * @finishing_cycle: Whether completing a cycle. + **/ +static void toi_bio_cleanup(int finishing_cycle) +{ + if (!finishing_cycle) + return; + + if (toi_writer_buffer) { + toi_free_page(11, (unsigned long) toi_writer_buffer); + toi_writer_buffer = NULL; + } + + forget_signature_page(); + + if (header_block_device && toi_sig_data && + toi_sig_data->header_dev_t != resume_dev_t) + toi_close_bdev(header_block_device); + + header_block_device = NULL; + + close_resume_dev_t(0); +} + +static int toi_bio_write_header_init(void) +{ + int result; + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init"); + toi_rw_init(WRITE, 0); + toi_writer_buffer_posn = 0; + + /* Info needed to bootstrap goes at the start of the header. + * First we save the positions and devinfo, including the number + * of header pages. Then we save the structs containing data needed + * for reading the header pages back. + * Note that even if header pages take more than one page, when we + * read back the info, we will have restored the location of the + * next header page by the time we go to use it. + */ + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains."); + result = toi_serialise_extent_chains(); + + if (result) + return result; + + /* + * Signature page hasn't been modified at this point. Write it in + * the header so we can restore it later. 
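+	 *
+	 * The extent chains were serialised above; the unmodified signature
+	 * page follows so that toi_bio_restore_original_signature() can put
+	 * it back if the image is later discarded.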
+	 */
+	toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page.");
+	return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops,
+			(char *) toi_cur_sig_page,
+			PAGE_SIZE);
+}
+
+static int toi_bio_write_header_cleanup(void)
+{
+	int result = 0;
+
+	if (toi_writer_buffer_posn)
+		toi_bio_queue_write(&toi_writer_buffer);
+
+	result = toi_finish_all_io();
+
+	unowned = 0;
+	total_header_bytes = 0;
+
+	/* Set signature to say we have an image */
+	if (!result)
+		result = toi_bio_mark_have_image();
+
+	return result;
+}
+
+/*
+ * toi_bio_read_header_init()
+ *
+ * Description:
+ * 1. Attempt to read the device specified with resume=.
+ * 2. Check the contents of the swap header for our signature.
+ * 3. Warn, ignore, reset and/or continue as appropriate.
+ * 4. If continuing, read the toi_swap configuration section
+ *    of the header and set up block device info so we can read
+ *    the rest of the header & image.
+ *
+ * Returns:
+ * May not return if the user chooses to reboot at a warning.
+ * -EINVAL if we cannot resume at this time. Booting should continue
+ * normally.
+ */
+
+static int toi_bio_read_header_init(void)
+{
+	int result = 0;
+	char buf[32];
+
+	toi_writer_buffer_posn = 0;
+
+	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init");
+
+	if (!toi_sig_data) {
+		printk(KERN_INFO "toi_bio_read_header_init called when we "
+				"haven't verified there is an image!\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * If the header is not on the resume_swap_dev_t, get the resume device
+	 * first.
+	 */
+	toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.",
+			toi_sig_data->header_dev_t);
+	if (toi_sig_data->have_uuid) {
+		struct fs_info seek;
+		dev_t device;
+
+		strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16);
+		seek.dev_t = toi_sig_data->header_dev_t;
+		seek.last_mount_size = 0;
+		device = blk_lookup_fs_info(&seek);
+		if (device) {
+			printk("Using dev_t %s, returned by blk_lookup_fs_info.\n",
+					format_dev_t(buf, device));
+			toi_sig_data->header_dev_t = device;
+		}
+	}
+	if (toi_sig_data->header_dev_t != resume_dev_t) {
+		header_block_device = toi_open_bdev(NULL,
+				toi_sig_data->header_dev_t, 1);
+
+		if (IS_ERR(header_block_device))
+			return PTR_ERR(header_block_device);
+	} else
+		header_block_device = resume_block_device;
+
+	if (!toi_writer_buffer)
+		toi_writer_buffer = (char *) toi_get_zeroed_page(11,
+				TOI_ATOMIC_GFP);
+	more_readahead = 1;
+
+	/*
+	 * Read toi_swap configuration.
+	 * Headerblock size taken into account already.
+	 */
+	result = toi_bio_ops.bdev_page_io(READ, header_block_device,
+			toi_sig_data->first_header_block,
+			virt_to_page((unsigned long) toi_writer_buffer));
+	if (result)
+		return result;
+
+	toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains.");
+	result = toi_load_extent_chains();
+
+	toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page.");
+	toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
+	if (!toi_orig_sig_page) {
+		printk(KERN_ERR "Failed to allocate memory for the current"
+				" image signature.\n");
+		return -ENOMEM;
+	}
+
+	return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops,
+			(char *) toi_orig_sig_page,
+			PAGE_SIZE);
+}
+
+static int toi_bio_read_header_cleanup(void)
+{
+	toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup.");
+	return toi_rw_cleanup(READ);
+}
+
+/* Works only for digits and letters, but small and fast */
+#define TOLOWER(x)	((x) | 0x20)
+
+/*
+ * UUID must be 32 chars long. It may have dashes, but nothing
+ * else.
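+ *
+ * For example, both of these forms (values are illustrative only) would
+ * be accepted:
+ *   resume=UUID=0123456789abcdef0123456789abcdef
+ *   resume=UUID=01234567-89ab-cdef-0123-456789abcdef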
+ */ +char *uuid_from_commandline(char *commandline) +{ + int low = 0; + char *result = NULL, *output, *ptr; + + if (strncmp(commandline, "UUID=", 5)) + return NULL; + + result = kzalloc(17, GFP_KERNEL); + if (!result) { + printk("Failed to kzalloc UUID text memory.\n"); + return NULL; + } + + ptr = commandline + 5; + output = result; + + while (*ptr && (output - result) < 16) { + if (isxdigit(*ptr)) { + int value = isdigit(*ptr) ? *ptr - '0' : + TOLOWER(*ptr) - 'a' + 10; + if (low) { + *output += value; + output++; + } else { + *output = value << 4; + } + low = !low; + } else if (*ptr != '-') + break; + ptr++; + } + + if ((output - result) < 16 || *ptr) { + printk(KERN_DEBUG "Found resume=UUID=, but the value looks " + "invalid.\n"); + kfree(result); + result = NULL; + } + + return result; +} + +#define retry_if_fails(command) \ +do { \ + command; \ + if (!resume_dev_t && !waited_for_device_probe) { \ + wait_for_device_probe(); \ + command; \ + waited_for_device_probe = 1; \ + } \ +} while(0) + +/** + * try_to_open_resume_device: Try to parse and open resume= + * + * Any "swap:" has been stripped away and we just have the path to deal with. + * We attempt to do name_to_dev_t, open and stat the file. Having opened the + * file, get the struct block_device * to match. + */ +static int try_to_open_resume_device(char *commandline, int quiet) +{ + struct kstat stat; + int error = 0; + char *uuid = uuid_from_commandline(commandline); + int waited_for_device_probe = 0; + + resume_dev_t = MKDEV(0, 0); + + if (!strlen(commandline)) + retry_if_fails(toi_bio_scan_for_image(quiet)); + + if (uuid) { + struct fs_info seek; + strncpy((char *) &seek.uuid, uuid, 16); + seek.dev_t = resume_dev_t; + seek.last_mount_size = 0; + retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek)); + kfree(uuid); + } + + if (!resume_dev_t) + retry_if_fails(resume_dev_t = name_to_dev_t(commandline)); + + if (!resume_dev_t) { + struct file *file = filp_open(commandline, + O_RDONLY|O_LARGEFILE, 0); + + if (!IS_ERR(file) && file) { + vfs_getattr(&file->f_path, &stat); + filp_close(file, NULL); + } else + error = vfs_stat(commandline, &stat); + if (!error) + resume_dev_t = stat.rdev; + } + + if (!resume_dev_t) { + if (quiet) + return 1; + + if (test_toi_state(TOI_TRYING_TO_RESUME)) + toi_early_boot_message(1, toi_translate_err_default, + "Failed to translate \"%s\" into a device id.\n", + commandline); + else + printk("TuxOnIce: Can't translate \"%s\" into a device " + "id yet.\n", commandline); + return 1; + } + + return open_resume_dev_t(1, quiet); +} + +/* + * Parse Image Location + * + * Attempt to parse a resume= parameter. + * Swap Writer accepts: + * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] + * + * Where: + * DEVNAME is convertable to a dev_t by name_to_dev_t + * FIRSTBLOCK is the location of the first block in the swap file + * (specifying for a swap partition is nonsensical but not prohibited). + * Data is validated by attempting to read a swap header from the + * location given. Failure will result in toi_swap refusing to + * save an image, and a reboot with correct parameters will be + * necessary. 
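+ *
+ * Examples (device names and block offsets are illustrative only):
+ *   resume=swap:/dev/sda2
+ *   resume=file:/dev/sda1:0x2000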
+ */ +static int toi_bio_parse_sig_location(char *commandline, + int only_allocator, int quiet) +{ + char *thischar, *devstart, *colon = NULL; + int signature_found, result = -EINVAL, temp_result = 0; + + if (strncmp(commandline, "swap:", 5) && + strncmp(commandline, "file:", 5)) { + /* + * Failing swap:, we'll take a simple resume=/dev/hda2, or a + * blank value (scan) but fall through to other allocators + * if /dev/ or UUID= isn't matched. + */ + if (strncmp(commandline, "/dev/", 5) && + strncmp(commandline, "UUID=", 5) && + strlen(commandline)) + return 1; + } else + commandline += 5; + + devstart = commandline; + thischar = commandline; + while ((*thischar != ':') && (*thischar != '@') && + ((thischar - commandline) < 250) && (*thischar)) + thischar++; + + if (*thischar == ':') { + colon = thischar; + *colon = 0; + thischar++; + } + + while ((thischar - commandline) < 250 && *thischar) + thischar++; + + if (colon) { + unsigned long block; + temp_result = kstrtoul(colon + 1, 0, &block); + if (!temp_result) + resume_firstblock = (int) block; + } else + resume_firstblock = 0; + + clear_toi_state(TOI_CAN_HIBERNATE); + clear_toi_state(TOI_CAN_RESUME); + + if (!temp_result) + temp_result = try_to_open_resume_device(devstart, quiet); + + if (colon) + *colon = ':'; + + /* No error if we only scanned */ + if (temp_result) + return strlen(commandline) ? -EINVAL : 1; + + signature_found = toi_bio_image_exists(quiet); + + if (signature_found != -1) { + result = 0; + /* + * TODO: If only file storage, CAN_HIBERNATE should only be + * set if file allocator's target is valid. + */ + set_toi_state(TOI_CAN_HIBERNATE); + set_toi_state(TOI_CAN_RESUME); + } else + if (!quiet) + printk(KERN_ERR "TuxOnIce: Block I/O: No " + "signature found at %s.\n", devstart); + + return result; +} + +static void toi_bio_release_storage(void) +{ + header_pages_reserved = 0; + raw_pages_allocd = 0; + + free_all_bdev_info(); +} + +/* toi_swap_remove_image + * + */ +static int toi_bio_remove_image(void) +{ + int result; + + toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image."); + + result = toi_bio_restore_original_signature(); + + /* + * We don't do a sanity check here: we want to restore the swap + * whatever version of kernel made the hibernate image. + * + * We need to write swap, but swap may not be enabled so + * we write the device directly + * + * If we don't have an current_signature_page, we didn't + * read an image header, so don't change anything. 
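+	 *
+	 * (toi_bio_restore_original_signature() above merely logs and returns
+	 * if no signature page was ever loaded.)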
+ */ + + toi_bio_release_storage(); + + return result; +} + +struct toi_bio_ops toi_bio_ops = { + .bdev_page_io = toi_bdev_page_io, + .register_storage = toi_register_storage_chain, + .free_storage = toi_bio_release_storage, +}; + +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io, + 0, 16384, 0, NULL), +}; + +struct toi_module_ops toi_blockwriter_ops = { + .type = WRITER_MODULE, + .name = "block i/o", + .directory = "block_io", + .module = THIS_MODULE, + .memory_needed = toi_bio_memory_needed, + .print_debug_info = toi_bio_print_debug_stats, + .storage_needed = toi_bio_storage_needed, + .save_config_info = toi_bio_save_config_info, + .load_config_info = toi_bio_load_config_info, + .initialise = toi_bio_initialise, + .cleanup = toi_bio_cleanup, + .post_atomic_restore = toi_bio_chains_post_atomic, + + .rw_init = toi_rw_init, + .rw_cleanup = toi_rw_cleanup, + .read_page = toi_bio_read_page, + .write_page = toi_bio_write_page, + .rw_header_chunk = toi_rw_header_chunk, + .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead, + .io_flusher = bio_io_flusher, + .update_throughput_throttle = update_throughput_throttle, + .finish_all_io = toi_finish_all_io, + + .noresume_reset = toi_bio_noresume_reset, + .storage_available = toi_bio_storage_available, + .storage_allocated = toi_bio_storage_allocated, + .reserve_header_space = toi_bio_reserve_header_space, + .allocate_storage = toi_bio_allocate_storage, + .free_unused_storage = toi_bio_free_unused_storage, + .image_exists = toi_bio_image_exists, + .mark_resume_attempted = toi_bio_mark_resume_attempted, + .write_header_init = toi_bio_write_header_init, + .write_header_cleanup = toi_bio_write_header_cleanup, + .read_header_init = toi_bio_read_header_init, + .read_header_cleanup = toi_bio_read_header_cleanup, + .get_header_version = toi_bio_get_header_version, + .remove_image = toi_bio_remove_image, + .parse_sig_location = toi_bio_parse_sig_location, + + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +/** + * toi_block_io_load - load time routine for block I/O module + * + * Register block i/o ops and sysfs entries. + **/ +static __init int toi_block_io_load(void) +{ + return toi_register_module(&toi_blockwriter_ops); +} + +late_initcall(toi_block_io_load); diff -Nur linux-4.2/kernel/power/tuxonice_bio_internal.h linux-4.2-pck/kernel/power/tuxonice_bio_internal.h --- linux-4.2/kernel/power/tuxonice_bio_internal.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_bio_internal.h 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,101 @@ +/* + * kernel/power/tuxonice_bio_internal.h + * + * Copyright (C) 2009-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * Distributed under GPLv2. + * + * This file contains declarations for functions exported from + * tuxonice_bio.c, which contains low level io functions. 
+ */ + +/* Extent chains */ +void toi_extent_state_goto_start(void); +void toi_extent_state_save(int slot); +int go_next_page(int writing, int section_barrier); +void toi_extent_state_restore(int slot); +void free_all_bdev_info(void); +int devices_of_same_priority(struct toi_bdev_info *this); +int toi_register_storage_chain(struct toi_bdev_info *new); +int toi_serialise_extent_chains(void); +int toi_load_extent_chains(void); +int toi_bio_rw_page(int writing, struct page *page, int is_readahead, + int free_group); +int toi_bio_restore_original_signature(void); +int toi_bio_devinfo_storage_needed(void); +unsigned long get_headerblock(void); +dev_t get_header_dev_t(void); +struct block_device *get_header_bdev(void); +int toi_bio_allocate_storage(unsigned long request); +void toi_bio_free_unused_storage(void); + +/* Signature functions */ +#define HaveImage "HaveImage" +#define NoImage "TuxOnIce" +#define sig_size (sizeof(HaveImage)) + +struct sig_data { + char sig[sig_size]; + int have_image; + int resumed_before; + + char have_uuid; + char header_uuid[17]; + dev_t header_dev_t; + unsigned long first_header_block; + + /* Repeat the signature to be sure we have a header version */ + char sig2[sig_size]; + int header_version; +}; + +void forget_signature_page(void); +int toi_check_for_signature(void); +int toi_bio_image_exists(int quiet); +int get_signature_page(void); +int toi_bio_mark_resume_attempted(int); +extern char *toi_cur_sig_page; +extern char *toi_orig_sig_page; +int toi_bio_mark_have_image(void); +extern struct sig_data *toi_sig_data; +extern dev_t resume_dev_t; +extern struct block_device *resume_block_device; +extern struct block_device *header_block_device; +extern unsigned long resume_firstblock; + +struct block_device *open_bdev(dev_t device, int display_errs); +extern int current_stream; +extern int more_readahead; +int toi_do_io(int writing, struct block_device *bdev, long block0, + struct page *page, int is_readahead, int syncio, int free_group); +int get_main_pool_phys_params(void); + +void toi_close_bdev(struct block_device *bdev); +struct block_device *toi_open_bdev(char *uuid, dev_t default_device, + int display_errs); + +extern struct toi_module_ops toi_blockwriter_ops; +void dump_block_chains(void); +void debug_broken_header(void); +extern unsigned long raw_pages_allocd, header_pages_reserved; +int toi_bio_chains_debug_info(char *buffer, int size); +void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd); +int toi_bio_scan_for_image(int quiet); +int toi_bio_get_header_version(void); + +void close_resume_dev_t(int force); +int open_resume_dev_t(int force, int quiet); + +struct toi_incremental_image_pointer_saved_data { + unsigned long block; + int chain; +}; + +struct toi_incremental_image_pointer { + struct toi_incremental_image_pointer_saved_data save; + struct block_device *bdev; + unsigned long block; +}; + +void toi_bio_store_inc_image_ptr(struct toi_incremental_image_pointer *ptr); +void toi_bio_restore_inc_image_ptr(struct toi_incremental_image_pointer *ptr); diff -Nur linux-4.2/kernel/power/tuxonice_bio_signature.c linux-4.2-pck/kernel/power/tuxonice_bio_signature.c --- linux-4.2/kernel/power/tuxonice_bio_signature.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_bio_signature.c 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,403 @@ +/* + * kernel/power/tuxonice_bio_signature.c + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * Distributed under GPLv2. 
+ * + */ + +#include + +#include "tuxonice.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_modules.h" +#include "tuxonice_prepare_image.h" +#include "tuxonice_bio.h" +#include "tuxonice_ui.h" +#include "tuxonice_alloc.h" +#include "tuxonice_io.h" +#include "tuxonice_builtin.h" +#include "tuxonice_bio_internal.h" + +struct sig_data *toi_sig_data; + +/* Struct of swap header pages */ + +struct old_sig_data { + dev_t device; + unsigned long sector; + int resume_attempted; + int orig_sig_type; +}; + +union diskpage { + union swap_header swh; /* swh.magic is the only member used */ + struct sig_data sig_data; + struct old_sig_data old_sig_data; +}; + +union p_diskpage { + union diskpage *pointer; + char *ptr; + unsigned long address; +}; + +char *toi_cur_sig_page; +char *toi_orig_sig_page; +int have_image; +int have_old_image; + +int get_signature_page(void) +{ + if (!toi_cur_sig_page) { + toi_message(TOI_IO, TOI_VERBOSE, 0, + "Allocating current signature page."); + toi_cur_sig_page = (char *) toi_get_zeroed_page(38, + TOI_ATOMIC_GFP); + if (!toi_cur_sig_page) { + printk(KERN_ERR "Failed to allocate memory for the " + "current image signature.\n"); + return -ENOMEM; + } + + toi_sig_data = (struct sig_data *) toi_cur_sig_page; + } + + toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx," + " sector %d.", + resume_block_device->bd_dev, resume_firstblock); + + return toi_bio_ops.bdev_page_io(READ, resume_block_device, + resume_firstblock, virt_to_page(toi_cur_sig_page)); +} + +void forget_signature_page(void) +{ + if (toi_cur_sig_page) { + toi_sig_data = NULL; + toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page" + " (%p).", toi_cur_sig_page); + toi_free_page(38, (unsigned long) toi_cur_sig_page); + toi_cur_sig_page = NULL; + } + + if (toi_orig_sig_page) { + toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page" + " (%p).", toi_orig_sig_page); + toi_free_page(38, (unsigned long) toi_orig_sig_page); + toi_orig_sig_page = NULL; + } +} + +/* + * We need to ensure we use the signature page that's currently on disk, + * so as to not remove the image header. Post-atomic-restore, the orig sig + * page will be empty, so we can use that as our method of knowing that we + * need to load the on-disk signature and not use the non-image sig in + * memory. (We're going to powerdown after writing the change, so it's safe. 
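+ * Hence toi_bio_mark_resume_attempted() below re-reads the signature page
+ * from disk whenever toi_orig_sig_page is empty, so the flag is always
+ * written over whatever is currently on disk.)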
+ */ +int toi_bio_mark_resume_attempted(int flag) +{ + toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.", + flag); + if (!toi_orig_sig_page) { + forget_signature_page(); + get_signature_page(); + } + toi_sig_data->resumed_before = flag; + return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, + resume_firstblock, virt_to_page(toi_cur_sig_page)); +} + +int toi_bio_mark_have_image(void) +{ + int result = 0; + char buf[32]; + struct fs_info *fs_info; + + toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists."); + memcpy(toi_sig_data->sig, tuxonice_signature, + sizeof(tuxonice_signature)); + toi_sig_data->have_image = 1; + toi_sig_data->resumed_before = 0; + toi_sig_data->header_dev_t = get_header_dev_t(); + toi_sig_data->have_uuid = 0; + + fs_info = fs_info_from_block_dev(get_header_bdev()); + if (fs_info && !IS_ERR(fs_info)) { + memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16); + free_fs_info(fs_info); + } else + result = (int) PTR_ERR(fs_info); + + if (!result) { + toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.", + format_dev_t(buf, get_header_dev_t())); + toi_sig_data->have_uuid = 1; + } else + toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for " + "dev_t %s.", + format_dev_t(buf, get_header_dev_t())); + + toi_sig_data->first_header_block = get_headerblock(); + have_image = 1; + toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block " + "is %d.", toi_sig_data->header_dev_t, + toi_sig_data->first_header_block); + + memcpy(toi_sig_data->sig2, tuxonice_signature, + sizeof(tuxonice_signature)); + toi_sig_data->header_version = TOI_HEADER_VERSION; + + return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, + resume_firstblock, virt_to_page(toi_cur_sig_page)); +} + +int remove_old_signature(void) +{ + union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page; + char *orig_sig; + char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); + int result; + struct block_device *header_bdev; + struct old_sig_data *old_sig_data = + &swap_header_page.pointer->old_sig_data; + + header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1); + result = toi_bio_ops.bdev_page_io(READ, header_bdev, + old_sig_data->sector, virt_to_page(header_start)); + + if (result) + goto out; + + /* + * TODO: Get the original contents of the first bytes of the swap + * header page. + */ + if (!old_sig_data->orig_sig_type) + orig_sig = "SWAP-SPACE"; + else + orig_sig = "SWAPSPACE2"; + + memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10); + memcpy(swap_header_page.ptr, header_start, 10); + + result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device, + resume_firstblock, virt_to_page(swap_header_page.ptr)); + +out: + toi_close_bdev(header_bdev); + have_old_image = 0; + toi_free_page(38, (unsigned long) header_start); + return result; +} + +/* + * toi_bio_restore_original_signature - restore the original signature + * + * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used. + * It will have the original signature page contents, stored in the image + * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain + * the contents that were loaded when we started the cycle. + */ +int toi_bio_restore_original_signature(void) +{ + char *use = toi_orig_sig_page ? 
toi_orig_sig_page : toi_cur_sig_page; + + if (have_old_image) + return remove_old_signature(); + + if (!use) { + printk("toi_bio_restore_original_signature: No signature " + "page loaded.\n"); + return 0; + } + + toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists."); + have_image = 0; + toi_sig_data->have_image = 0; + return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, + resume_firstblock, virt_to_page(use)); +} + +/* + * check_for_signature - See whether we have an image. + * + * Returns 0 if no image, 1 if there is one, -1 if indeterminate. + */ +int toi_check_for_signature(void) +{ + union p_diskpage swap_header_page; + int type; + const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" }; + const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" }; + char *swap_header; + + if (!toi_cur_sig_page) { + int result = get_signature_page(); + + if (result) + return result; + } + + /* + * Start by looking for the binary header. + */ + if (!memcmp(tuxonice_signature, toi_cur_sig_page, + sizeof(tuxonice_signature))) { + have_image = toi_sig_data->have_image; + toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. " + "Have image is %d.", have_image); + if (have_image) + toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is " + "%x. First block is %d.", + toi_sig_data->header_dev_t, + toi_sig_data->first_header_block); + return toi_sig_data->have_image; + } + + /* + * Failing that, try old file allocator headers. + */ + + if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) { + have_image = 1; + return 1; + } + + have_image = 0; + + if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage))) + return 0; + + /* + * Nope? How about swap? + */ + swap_header_page = (union p_diskpage) toi_cur_sig_page; + swap_header = swap_header_page.pointer->swh.magic.magic; + + /* Normal swapspace? */ + for (type = 0; type < 2; type++) + if (!memcmp(normal_sigs[type], swap_header, + strlen(normal_sigs[type]))) + return 0; + + /* Swsusp or uswsusp? */ + for (type = 0; type < 3; type++) + if (!memcmp(swsusp_sigs[type], swap_header, + strlen(swsusp_sigs[type]))) + return 2; + + /* Old TuxOnIce version? */ + if (!memcmp(tuxonice_signature, swap_header, + sizeof(tuxonice_signature) - 1)) { + toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce " + "signature."); + have_old_image = 1; + return 3; + } + + return -1; +} + +/* + * Image_exists + * + * Returns -1 if don't know, otherwise 0 (no) or 1 (yes). + */ +int toi_bio_image_exists(int quiet) +{ + int result; + char *msg = NULL; + + toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists."); + + if (!resume_dev_t) { + if (!quiet) + printk(KERN_INFO "Not even trying to read header " + "because resume_dev_t is not set.\n"); + return -1; + } + + if (open_resume_dev_t(0, quiet)) + return -1; + + result = toi_check_for_signature(); + + clear_toi_state(TOI_RESUMED_BEFORE); + if (toi_sig_data->resumed_before) + set_toi_state(TOI_RESUMED_BEFORE); + + if (quiet || result == -ENOMEM) + return result; + + if (result == -1) + msg = "TuxOnIce: Unable to find a signature." 
+ " Could you have moved a swap file?\n"; + else if (!result) + msg = "TuxOnIce: No image found.\n"; + else if (result == 1) + msg = "TuxOnIce: Image found.\n"; + else if (result == 2) + msg = "TuxOnIce: uswsusp or swsusp image found.\n"; + else if (result == 3) + msg = "TuxOnIce: Old implementation's signature found.\n"; + + printk(KERN_INFO "%s", msg); + + return result; +} + +int toi_bio_scan_for_image(int quiet) +{ + struct block_device *bdev; + char default_name[255] = ""; + + if (!quiet) + printk(KERN_DEBUG "Scanning swap devices for TuxOnIce " + "signature...\n"); + for (bdev = next_bdev_of_type(NULL, "swap"); bdev; + bdev = next_bdev_of_type(bdev, "swap")) { + int result; + char name[255] = ""; + sprintf(name, "%u:%u", MAJOR(bdev->bd_dev), + MINOR(bdev->bd_dev)); + if (!quiet) + printk(KERN_DEBUG "- Trying %s.\n", name); + resume_block_device = bdev; + resume_dev_t = bdev->bd_dev; + + result = toi_check_for_signature(); + + resume_block_device = NULL; + resume_dev_t = MKDEV(0, 0); + + if (!default_name[0]) + strcpy(default_name, name); + + if (result == 1) { + /* Got one! */ + strcpy(resume_file, name); + next_bdev_of_type(bdev, NULL); + if (!quiet) + printk(KERN_DEBUG " ==> Image found on %s.\n", + resume_file); + return 1; + } + forget_signature_page(); + } + + if (!quiet) + printk(KERN_DEBUG "TuxOnIce scan: No image found.\n"); + strcpy(resume_file, default_name); + return 0; +} + +int toi_bio_get_header_version(void) +{ + return (memcmp(toi_sig_data->sig2, tuxonice_signature, + sizeof(tuxonice_signature))) ? + 0 : toi_sig_data->header_version; + +} diff -Nur linux-4.2/kernel/power/tuxonice_builtin.c linux-4.2-pck/kernel/power/tuxonice_builtin.c --- linux-4.2/kernel/power/tuxonice_builtin.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_builtin.c 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,498 @@ +/* + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tuxonice_io.h" +#include "tuxonice.h" +#include "tuxonice_extent.h" +#include "tuxonice_netlink.h" +#include "tuxonice_prepare_image.h" +#include "tuxonice_ui.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_pagedir.h" +#include "tuxonice_modules.h" +#include "tuxonice_builtin.h" +#include "tuxonice_power_off.h" +#include "tuxonice_alloc.h" + +unsigned long toi_bootflags_mask; + +/* + * Highmem related functions (x86 only). + */ + +#ifdef CONFIG_HIGHMEM + +/** + * copyback_high: Restore highmem pages. + * + * Highmem data and pbe lists are/can be stored in highmem. + * The format is slightly different to the lowmem pbe lists + * used for the assembly code: the last pbe in each page is + * a struct page * instead of struct pbe *, pointing to the + * next page where pbes are stored (or NULL if happens to be + * the end of the list). Since we don't want to generate + * unnecessary deltas against swsusp code, we use a cast + * instead of a union. 
+ **/ + +static void copyback_high(void) +{ + struct page *pbe_page = (struct page *) restore_highmem_pblist; + struct pbe *this_pbe, *first_pbe; + unsigned long *origpage, *copypage; + int pbe_index = 1; + + if (!pbe_page) + return; + + this_pbe = (struct pbe *) kmap_atomic(pbe_page); + first_pbe = this_pbe; + + while (this_pbe) { + int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; + + origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address)); + copypage = kmap_atomic((struct page *) this_pbe->address); + + while (loop >= 0) { + *(origpage + loop) = *(copypage + loop); + loop--; + } + + kunmap_atomic(origpage); + kunmap_atomic(copypage); + + if (!this_pbe->next) + break; + + if (pbe_index < PBES_PER_PAGE) { + this_pbe++; + pbe_index++; + } else { + pbe_page = (struct page *) this_pbe->next; + kunmap_atomic(first_pbe); + if (!pbe_page) + return; + this_pbe = (struct pbe *) kmap_atomic(pbe_page); + first_pbe = this_pbe; + pbe_index = 1; + } + } + kunmap_atomic(first_pbe); +} + +#else /* CONFIG_HIGHMEM */ +static void copyback_high(void) { } +#endif + +char toi_wait_for_keypress_dev_console(int timeout) +{ + int fd, this_timeout = 255, orig_kthread = 0; + char key = '\0'; + struct termios t, t_backup; + + /* We should be guaranteed /dev/console exists after populate_rootfs() + * in init/main.c. + */ + fd = sys_open("/dev/console", O_RDONLY, 0); + if (fd < 0) { + printk(KERN_INFO "Couldn't open /dev/console.\n"); + return key; + } + + if (sys_ioctl(fd, TCGETS, (long)&t) < 0) + goto out_close; + + memcpy(&t_backup, &t, sizeof(t)); + + t.c_lflag &= ~(ISIG|ICANON|ECHO); + t.c_cc[VMIN] = 0; + +new_timeout: + if (timeout > 0) { + this_timeout = timeout < 26 ? timeout : 25; + timeout -= this_timeout; + this_timeout *= 10; + } + + t.c_cc[VTIME] = this_timeout; + + if (sys_ioctl(fd, TCSETS, (long)&t) < 0) + goto out_restore; + + if (current->flags & PF_KTHREAD) { + orig_kthread = (current->flags & PF_KTHREAD); + current->flags &= ~PF_KTHREAD; + } + + while (1) { + if (sys_read(fd, &key, 1) <= 0) { + if (timeout) + goto new_timeout; + key = '\0'; + break; + } + key = tolower(key); + if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) { + if (key == 'c') { + set_toi_state(TOI_CONTINUE_REQ); + break; + } else if (key == ' ') + break; + } else + break; + } + if (orig_kthread) { + current->flags |= PF_KTHREAD; + } + +out_restore: + sys_ioctl(fd, TCSETS, (long)&t_backup); +out_close: + sys_close(fd); + + return key; +} + +struct toi_boot_kernel_data toi_bkd __nosavedata + __attribute__((aligned(PAGE_SIZE))) = { + MY_BOOT_KERNEL_DATA_VERSION, + 0, +#ifdef CONFIG_TOI_REPLACE_SWSUSP + (1 << TOI_REPLACE_SWSUSP) | +#endif + (1 << TOI_NO_FLUSHER_THREAD) | + (1 << TOI_PAGESET2_FULL), +}; + +struct block_device *toi_open_by_devnum(dev_t dev) +{ + struct block_device *bdev = bdget(dev); + int err = -ENOMEM; + if (bdev) + err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); + return err ? ERR_PTR(err) : bdev; +} + +/** + * toi_close_bdev: Close a swap bdev. + * + * int: The swap entry number to close. 
+ */ +void toi_close_bdev(struct block_device *bdev) +{ + blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); +} + +int toi_wait = CONFIG_TOI_DEFAULT_WAIT; +struct toi_core_fns *toi_core_fns; +unsigned long toi_result; +struct pagedir pagedir1 = {1}; +struct toi_cbw **toi_first_cbw; +int toi_next_cbw; + +unsigned long toi_get_nonconflicting_page(void) +{ + return toi_core_fns->get_nonconflicting_page(); +} + +int toi_post_context_save(void) +{ + return toi_core_fns->post_context_save(); +} + +int try_tuxonice_hibernate(void) +{ + if (!toi_core_fns) + return -ENODEV; + + return toi_core_fns->try_hibernate(); +} + +static int num_resume_calls; +#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL +static int ignore_late_initcall = 1; +#else +static int ignore_late_initcall; +#endif + +int toi_translate_err_default = TOI_CONTINUE_REQ; + +void try_tuxonice_resume(void) +{ + if (!hibernation_available()) + return; + + /* Don't let it wrap around eventually */ + if (num_resume_calls < 2) + num_resume_calls++; + + if (num_resume_calls == 1 && ignore_late_initcall) { + printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n"); + return; + } + + if (toi_core_fns) + toi_core_fns->try_resume(); + else + printk(KERN_INFO "TuxOnIce core not loaded yet.\n"); +} + +int toi_lowlevel_builtin(void) +{ + int error = 0; + + save_processor_state(); + error = swsusp_arch_suspend(); + if (error) + printk(KERN_ERR "Error %d hibernating\n", error); + + /* Restore control flow appears here */ + if (!toi_in_hibernate) { + copyback_high(); + set_toi_state(TOI_NOW_RESUMING); + } + + restore_processor_state(); + return error; +} + +unsigned long toi_compress_bytes_in; +unsigned long toi_compress_bytes_out; + +int toi_in_suspend(void) +{ + return in_suspend; +} + +unsigned long toi_state = ((1 << TOI_BOOT_TIME) | + (1 << TOI_IGNORE_LOGLEVEL) | + (1 << TOI_IO_STOPPED)); + +/* The number of hibernates we have started (some may have been cancelled) */ +unsigned int nr_hibernates; +int toi_running; +__nosavedata int toi_in_hibernate; +__nosavedata struct pbe *restore_highmem_pblist; + +int toi_trace_allocs; + +void toi_read_lock_tasklist(void) +{ + read_lock(&tasklist_lock); +} + +void toi_read_unlock_tasklist(void) +{ + read_unlock(&tasklist_lock); +} + +#ifdef CONFIG_TOI_ZRAM_SUPPORT +int (*toi_flag_zram_disks) (void); + +int toi_do_flag_zram_disks(void) +{ + return toi_flag_zram_disks ? (*toi_flag_zram_disks)() : 0; +} + +#endif + +/* toi_generate_free_page_map + * + * Description: This routine generates a bitmap of free pages from the + * lists used by the memory manager. We then use the bitmap + * to quickly calculate which pages to save and in which + * pagesets. 
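+ *
+ * The bitmap is built by clearing PageNosaveFree on every valid pfn in
+ * each populated zone, then setting it again for pages sitting on the
+ * buddy free lists and in the per-cpu pagesets.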
+ */ +void toi_generate_free_page_map(void) +{ + int order, cpu, t; + unsigned long flags, i; + struct zone *zone; + struct list_head *curr; + unsigned long pfn; + struct page *page; + + for_each_populated_zone(zone) { + + if (!zone->spanned_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + + for (i = 0; i < zone->spanned_pages; i++) { + pfn = zone->zone_start_pfn + i; + + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + + ClearPageNosaveFree(page); + } + + for_each_migratetype_order(order, t) { + list_for_each(curr, + &zone->free_area[order].free_list[t]) { + unsigned long j; + + pfn = page_to_pfn(list_entry(curr, struct page, + lru)); + for (j = 0; j < (1UL << order); j++) + SetPageNosaveFree(pfn_to_page(pfn + j)); + } + } + + for_each_online_cpu(cpu) { + struct per_cpu_pageset *pset = + per_cpu_ptr(zone->pageset, cpu); + struct per_cpu_pages *pcp = &pset->pcp; + struct page *page; + int t; + + for (t = 0; t < MIGRATE_PCPTYPES; t++) + list_for_each_entry(page, &pcp->lists[t], lru) + SetPageNosaveFree(page); + } + + spin_unlock_irqrestore(&zone->lock, flags); + } +} + +/* toi_size_of_free_region + * + * Description: Return the number of pages that are free, beginning with and + * including this one. + */ +int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn) +{ + unsigned long this_pfn = start_pfn, + end_pfn = zone_end_pfn(zone); + + while (pfn_valid(this_pfn) && this_pfn < end_pfn && PageNosaveFree(pfn_to_page(this_pfn))) + this_pfn++; + + return this_pfn - start_pfn; +} + +static int __init toi_wait_setup(char *str) +{ + int value; + + if (sscanf(str, "=%d", &value)) { + if (value < -1 || value > 255) + printk(KERN_INFO "TuxOnIce_wait outside range -1 to " + "255.\n"); + else + toi_wait = value; + } + + return 1; +} +__setup("toi_wait", toi_wait_setup); + +static int __init toi_translate_retry_setup(char *str) +{ + toi_translate_err_default = 0; + return 1; +} +__setup("toi_translate_retry", toi_translate_retry_setup); + +static int __init toi_debug_setup(char *str) +{ + toi_bkd.toi_action |= (1 << TOI_LOGALL); + toi_bootflags_mask |= (1 << TOI_LOGALL); + toi_bkd.toi_debug_state = 255; + toi_bkd.toi_default_console_level = 7; + return 1; +} +__setup("toi_debug_setup", toi_debug_setup); + +static int __init toi_pause_setup(char *str) +{ + toi_bkd.toi_action |= (1 << TOI_PAUSE); + toi_bootflags_mask |= (1 << TOI_PAUSE); + return 1; +} +__setup("toi_pause", toi_pause_setup); + +#ifdef CONFIG_PM_DEBUG +static int __init toi_trace_allocs_setup(char *str) +{ + int value; + + if (sscanf(str, "=%d", &value)) + toi_trace_allocs = value; + + return 1; +} +__setup("toi_trace_allocs", toi_trace_allocs_setup); +#endif + +static int __init toi_ignore_late_initcall_setup(char *str) +{ + int value; + + if (sscanf(str, "=%d", &value)) + ignore_late_initcall = value; + + return 1; +} +__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup); + +static int __init toi_force_no_multithreaded_setup(char *str) +{ + int value; + + toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO); + toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO); + + if (sscanf(str, "=%d", &value) && value) + toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO); + + return 1; +} +__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup); + +#ifdef CONFIG_KGDB +static int __init toi_post_resume_breakpoint_setup(char *str) +{ + int value; + + toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT); + toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT); + if 
(sscanf(str, "=%d", &value) && value) + toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT); + + return 1; +} +__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup); +#endif + +static int __init toi_disable_readahead_setup(char *str) +{ + int value; + + toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD); + toi_bootflags_mask |= (1 << TOI_NO_READAHEAD); + if (sscanf(str, "=%d", &value) && value) + toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD); + + return 1; +} +__setup("toi_no_readahead", toi_disable_readahead_setup); diff -Nur linux-4.2/kernel/power/tuxonice_builtin.h linux-4.2-pck/kernel/power/tuxonice_builtin.h --- linux-4.2/kernel/power/tuxonice_builtin.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_builtin.h 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + */ +#include + +extern struct toi_core_fns *toi_core_fns; +extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out; +extern unsigned int nr_hibernates; +extern int toi_in_hibernate; + +extern __nosavedata struct pbe *restore_highmem_pblist; + +int toi_lowlevel_builtin(void); + +#ifdef CONFIG_HIGHMEM +extern __nosavedata struct zone_data *toi_nosave_zone_list; +extern __nosavedata unsigned long toi_nosave_max_pfn; +#endif + +extern unsigned long toi_get_nonconflicting_page(void); +extern int toi_post_context_save(void); + +extern char toi_wait_for_keypress_dev_console(int timeout); +extern struct block_device *toi_open_by_devnum(dev_t dev); +extern void toi_close_bdev(struct block_device *bdev); +extern int toi_wait; +extern int toi_translate_err_default; +extern int toi_force_no_multithreaded; +extern void toi_read_lock_tasklist(void); +extern void toi_read_unlock_tasklist(void); +extern int toi_in_suspend(void); +extern void toi_generate_free_page_map(void); +extern int toi_size_of_free_region(struct zone *zone, unsigned long start_pfn); + +#ifdef CONFIG_TOI_ZRAM_SUPPORT +extern int toi_do_flag_zram_disks(void); +#else +#define toi_do_flag_zram_disks() (0) +#endif diff -Nur linux-4.2/kernel/power/tuxonice_checksum.c linux-4.2-pck/kernel/power/tuxonice_checksum.c --- linux-4.2/kernel/power/tuxonice_checksum.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_checksum.c 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,392 @@ +/* + * kernel/power/tuxonice_checksum.c + * + * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * This file contains data checksum routines for TuxOnIce, + * using cryptoapi. They are used to locate any modifications + * made to pageset 2 while we're saving it. 
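+ *
+ * Each pageset 2 page is hashed (md4 by default) into a chain of
+ * checksum pages as it is saved; check_checksums() later re-hashes the
+ * pages and marks any whose digest has changed for resaving.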
+ */ + +#include +#include +#include +#include +#include + +#include "tuxonice.h" +#include "tuxonice_modules.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_io.h" +#include "tuxonice_pageflags.h" +#include "tuxonice_checksum.h" +#include "tuxonice_pagedir.h" +#include "tuxonice_alloc.h" +#include "tuxonice_ui.h" + +static struct toi_module_ops toi_checksum_ops; + +/* Constant at the mo, but I might allow tuning later */ +static char toi_checksum_name[32] = "md4"; +/* Bytes per checksum */ +#define CHECKSUM_SIZE (16) + +#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE) + +struct cpu_context { + struct crypto_hash *transform; + struct hash_desc desc; + struct scatterlist sg[2]; + char *buf; +}; + +static DEFINE_PER_CPU(struct cpu_context, contexts); +static int pages_allocated; +static unsigned long page_list; + +static int toi_num_resaved; + +static unsigned long this_checksum, next_page; +static int checksum_count; + +static inline int checksum_pages_needed(void) +{ + return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE); +} + +/* ---- Local buffer management ---- */ + +/* + * toi_checksum_cleanup + * + * Frees memory allocated for our labours. + */ +static void toi_checksum_cleanup(int ending_cycle) +{ + int cpu; + + if (ending_cycle) { + for_each_online_cpu(cpu) { + struct cpu_context *this = &per_cpu(contexts, cpu); + if (this->transform) { + crypto_free_hash(this->transform); + this->transform = NULL; + this->desc.tfm = NULL; + } + + if (this->buf) { + toi_free_page(27, (unsigned long) this->buf); + this->buf = NULL; + } + } + } +} + +/* + * toi_crypto_initialise + * + * Prepare to do some work by allocating buffers and transforms. + * Returns: Int: Zero. Even if we can't set up checksum, we still + * seek to hibernate. + */ +static int toi_checksum_initialise(int starting_cycle) +{ + int cpu; + + if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled) + return 0; + + if (!*toi_checksum_name) { + printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n"); + return 1; + } + + for_each_online_cpu(cpu) { + struct cpu_context *this = &per_cpu(contexts, cpu); + struct page *page; + + this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0); + if (IS_ERR(this->transform)) { + printk(KERN_INFO "TuxOnIce: Failed to initialise the " + "%s checksum algorithm: %ld.\n", + toi_checksum_name, (long) this->transform); + this->transform = NULL; + return 1; + } + + this->desc.tfm = this->transform; + this->desc.flags = 0; + + page = toi_alloc_page(27, GFP_KERNEL); + if (!page) + return 1; + this->buf = page_address(page); + sg_init_one(&this->sg[0], this->buf, PAGE_SIZE); + } + return 0; +} + +/* + * toi_checksum_print_debug_stats + * @buffer: Pointer to a buffer into which the debug info will be printed. + * @size: Size of the buffer. + * + * Print information to be recorded for debugging purposes into a buffer. + * Returns: Number of characters written to the buffer. + */ + +static int toi_checksum_print_debug_stats(char *buffer, int size) +{ + int len; + + if (!toi_checksum_ops.enabled) + return scnprintf(buffer, size, + "- Checksumming disabled.\n"); + + len = scnprintf(buffer, size, "- Checksum method is '%s'.\n", + toi_checksum_name); + len += scnprintf(buffer + len, size - len, + " %d pages resaved in atomic copy.\n", toi_num_resaved); + return len; +} + +static int toi_checksum_memory_needed(void) +{ + return toi_checksum_ops.enabled ? 
+ checksum_pages_needed() << PAGE_SHIFT : 0; +} + +static int toi_checksum_storage_needed(void) +{ + if (toi_checksum_ops.enabled) + return strlen(toi_checksum_name) + sizeof(int) + 1; + else + return 0; +} + +/* + * toi_checksum_save_config_info + * @buffer: Pointer to a buffer of size PAGE_SIZE. + * + * Save informaton needed when reloading the image at resume time. + * Returns: Number of bytes used for saving our data. + */ +static int toi_checksum_save_config_info(char *buffer) +{ + int namelen = strlen(toi_checksum_name) + 1; + int total_len; + + *((unsigned int *) buffer) = namelen; + strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen); + total_len = sizeof(unsigned int) + namelen; + return total_len; +} + +/* toi_checksum_load_config_info + * @buffer: Pointer to the start of the data. + * @size: Number of bytes that were saved. + * + * Description: Reload information needed for dechecksuming the image at + * resume time. + */ +static void toi_checksum_load_config_info(char *buffer, int size) +{ + int namelen; + + namelen = *((unsigned int *) (buffer)); + strncpy(toi_checksum_name, buffer + sizeof(unsigned int), + namelen); + return; +} + +/* + * Free Checksum Memory + */ + +void free_checksum_pages(void) +{ + while (pages_allocated) { + unsigned long next = *((unsigned long *) page_list); + ClearPageNosave(virt_to_page(page_list)); + toi_free_page(15, (unsigned long) page_list); + page_list = next; + pages_allocated--; + } +} + +/* + * Allocate Checksum Memory + */ + +int allocate_checksum_pages(void) +{ + int pages_needed = checksum_pages_needed(); + + if (!toi_checksum_ops.enabled) + return 0; + + while (pages_allocated < pages_needed) { + unsigned long *new_page = + (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP); + if (!new_page) { + printk(KERN_ERR "Unable to allocate checksum pages.\n"); + return -ENOMEM; + } + SetPageNosave(virt_to_page(new_page)); + (*new_page) = page_list; + page_list = (unsigned long) new_page; + pages_allocated++; + } + + next_page = (unsigned long) page_list; + checksum_count = 0; + + return 0; +} + +char *tuxonice_get_next_checksum(void) +{ + if (!toi_checksum_ops.enabled) + return NULL; + + if (checksum_count % CHECKSUMS_PER_PAGE) + this_checksum += CHECKSUM_SIZE; + else { + this_checksum = next_page + sizeof(void *); + next_page = *((unsigned long *) next_page); + } + + checksum_count++; + return (char *) this_checksum; +} + +int tuxonice_calc_checksum(struct page *page, char *checksum_locn) +{ + char *pa; + int result, cpu = smp_processor_id(); + struct cpu_context *ctx = &per_cpu(contexts, cpu); + + if (!toi_checksum_ops.enabled) + return 0; + + pa = kmap(page); + memcpy(ctx->buf, pa, PAGE_SIZE); + kunmap(page); + result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, + checksum_locn); + if (result) + printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest " + "returned %d.\n", result); + return result; +} +/* + * Calculate checksums + */ + +void check_checksums(void) +{ + int index = 0, cpu = smp_processor_id(); + char current_checksum[CHECKSUM_SIZE]; + struct cpu_context *ctx = &per_cpu(contexts, cpu); + unsigned long pfn; + + if (!toi_checksum_ops.enabled) { + toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled."); + return; + } + + next_page = (unsigned long) page_list; + + toi_num_resaved = 0; + this_checksum = 0; + + toi_trace_index++; + + toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums."); + memory_bm_position_reset(pageset2_map); + for (pfn = memory_bm_next_pfn(pageset2_map, 0); pfn != 
BM_END_OF_MAP; + pfn = memory_bm_next_pfn(pageset2_map, 0)) { + int ret, resave_needed = false; + char *pa; + struct page *page = pfn_to_page(pfn); + + if (index < checksum_count) { + if (index % CHECKSUMS_PER_PAGE) { + this_checksum += CHECKSUM_SIZE; + } else { + this_checksum = next_page + sizeof(void *); + next_page = *((unsigned long *) next_page); + } + + /* Done when IRQs disabled so must be atomic */ + pa = kmap_atomic(page); + memcpy(ctx->buf, pa, PAGE_SIZE); + kunmap_atomic(pa); + ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, + current_checksum); + + if (ret) { + printk(KERN_INFO "Digest failed. Returned %d.\n", ret); + return; + } + + resave_needed = memcmp(current_checksum, (char *) this_checksum, + CHECKSUM_SIZE); + } else { + resave_needed = true; + } + + if (resave_needed) { + TOI_TRACE_DEBUG(pfn, "_Resaving %d", resave_needed); + SetPageResave(pfn_to_page(pfn)); + toi_num_resaved++; + if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED)) + set_abort_result(TOI_RESAVE_NEEDED); + } + + index++; + } + toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete."); +} + +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0, + NULL), + SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action, + TOI_ABORT_ON_RESAVE_NEEDED, 0) +}; + +/* + * Ops structure. + */ +static struct toi_module_ops toi_checksum_ops = { + .type = MISC_MODULE, + .name = "checksumming", + .directory = "checksum", + .module = THIS_MODULE, + .initialise = toi_checksum_initialise, + .cleanup = toi_checksum_cleanup, + .print_debug_info = toi_checksum_print_debug_stats, + .save_config_info = toi_checksum_save_config_info, + .load_config_info = toi_checksum_load_config_info, + .memory_needed = toi_checksum_memory_needed, + .storage_needed = toi_checksum_storage_needed, + + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +/* ---- Registration ---- */ +int toi_checksum_init(void) +{ + int result = toi_register_module(&toi_checksum_ops); + return result; +} + +void toi_checksum_exit(void) +{ + toi_unregister_module(&toi_checksum_ops); +} diff -Nur linux-4.2/kernel/power/tuxonice_checksum.h linux-4.2-pck/kernel/power/tuxonice_checksum.h --- linux-4.2/kernel/power/tuxonice_checksum.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_checksum.h 2015-09-08 11:20:32.463678805 -0300 @@ -0,0 +1,31 @@ +/* + * kernel/power/tuxonice_checksum.h + * + * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * This file contains data checksum routines for TuxOnIce, + * using cryptoapi. They are used to locate any modifications + * made to pageset 2 while we're saving it. 
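+ *
+ * A rough sketch of how the write path is expected to drive this API
+ * (the real call sites live in the TuxOnIce core, so treat this as
+ * illustrative only):
+ *
+ *     allocate_checksum_pages();
+ *     for each pageset 2 page {
+ *         char *where = tuxonice_get_next_checksum();
+ *         tuxonice_calc_checksum(page, where);
+ *     }
+ *     ... atomic copy and image write ...
+ *     check_checksums();      -- flags changed pages for resaving
+ *     free_checksum_pages();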
+ */ + +#if defined(CONFIG_TOI_CHECKSUM) +extern int toi_checksum_init(void); +extern void toi_checksum_exit(void); +void check_checksums(void); +int allocate_checksum_pages(void); +void free_checksum_pages(void); +char *tuxonice_get_next_checksum(void); +int tuxonice_calc_checksum(struct page *page, char *checksum_locn); +#else +static inline int toi_checksum_init(void) { return 0; } +static inline void toi_checksum_exit(void) { } +static inline void check_checksums(void) { }; +static inline int allocate_checksum_pages(void) { return 0; }; +static inline void free_checksum_pages(void) { }; +static inline char *tuxonice_get_next_checksum(void) { return NULL; }; +static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn) + { return 0; } +#endif + diff -Nur linux-4.2/kernel/power/tuxonice_cluster.c linux-4.2-pck/kernel/power/tuxonice_cluster.c --- linux-4.2/kernel/power/tuxonice_cluster.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_cluster.c 2015-09-08 11:20:32.467011976 -0300 @@ -0,0 +1,1058 @@ +/* + * kernel/power/tuxonice_cluster.c + * + * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * This file contains routines for cluster hibernation support. + * + * Based on ip autoconfiguration code in net/ipv4/ipconfig.c. + * + * How does it work? + * + * There is no 'master' node that tells everyone else what to do. All nodes + * send messages to the broadcast address/port, maintain a list of peers + * and figure out when to progress to the next step in hibernating or resuming. + * This makes us more fault tolerant when it comes to nodes coming and going + * (which may be more of an issue if we're hibernating when power supplies + * are being unreliable). + * + * At boot time, we start a ktuxonice thread that handles communication with + * other nodes. This node maintains a state machine that controls our progress + * through hibernating and resuming, keeping us in step with other nodes. Nodes + * are identified by their hw address. + * + * On startup, the node sends CLUSTER_PING on the configured interface's + * broadcast address, port $toi_cluster_port (see below) and begins to listen + * for other broadcast messages. CLUSTER_PING messages are repeated at + * intervals of 5 minutes, with a random offset to spread traffic out. + * + * A hibernation cycle is initiated from any node via + * + * echo > /sys/power/tuxonice/do_hibernate + * + * and (possibily) the hibernate script. At each step of the process, the node + * completes its work, and waits for all other nodes to signal completion of + * their work (or timeout) before progressing to the next step. + * + * Request/state Action before reply Possible reply Next state + * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP + * HIBERNATE|NACK INIT_0 + * + * PREP prepare_image PREP|ACK IMAGE_WRITE + * PREP|NACK INIT_0 + * ABORT RUNNING + * + * IO write image IO|ACK power off + * ABORT POST_RESUME + * + * (Boot time) check for image IMAGE|ACK RESUME_PREP + * (Note 1) + * IMAGE|NACK (Note 2) + * + * PREP prepare read image PREP|ACK IMAGE_READ + * PREP|NACK (As NACK_IMAGE) + * + * IO read image IO|ACK POST_RESUME + * + * POST_RESUME thaw, post-script RUNNING + * + * INIT_0 init 0 + * + * Other messages: + * + * - PING: Request for all other live nodes to send a PONG. Used at startup to + * announce presence, when a node is suspected dead and periodically, in case + * segments of the network are [un]plugged. 
+ * + * - PONG: Response to a PING. + * + * - ABORT: Request to cancel writing an image. + * + * - BYE: Notification that this node is shutting down. + * + * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that + * nodes which are slower to start up can get state synchronised. If a node + * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send + * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it + * must invalidate its image (if any) and boot normally. + * + * Note 2: May occur when one node lost power or powered off while others + * hibernated. This node waits for others to complete resuming (ACK_READ) + * before completing its boot, so that it appears as a fail node restarting. + * + * If any node has an image, then it also has a list of nodes that hibernated + * in synchronisation with it. The node will wait for other nodes to appear + * or timeout before beginning its restoration. + * + * If a node has no image, it needs to wait, in case other nodes which do have + * an image are going to resume, but are taking longer to announce their + * presence. For this reason, the user can specify a timeout value and a number + * of nodes detected before we just continue. (We might want to assume in a + * cluster of, say, 15 nodes, if 8 others have booted without finding an image, + * the remaining nodes will too. This might help in situations where some nodes + * are much slower to boot, or more subject to hardware failures or such like). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tuxonice.h" +#include "tuxonice_modules.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_alloc.h" +#include "tuxonice_io.h" + +#if 1 +#define PRINTK(a, b...) do { printk(a, ##b); } while (0) +#else +#define PRINTK(a, b...) do { } while (0) +#endif + +static int loopback_mode; +static int num_local_nodes = 1; +#define MAX_LOCAL_NODES 8 +#define SADDR (loopback_mode ? 
b->sid : h->saddr) + +#define MYNAME "TuxOnIce Clustering" + +enum cluster_message { + MSG_ACK = 1, + MSG_NACK = 2, + MSG_PING = 4, + MSG_ABORT = 8, + MSG_BYE = 16, + MSG_HIBERNATE = 32, + MSG_IMAGE = 64, + MSG_IO = 128, + MSG_RUNNING = 256 +}; + +static char *str_message(int message) +{ + switch (message) { + case 4: + return "Ping"; + case 8: + return "Abort"; + case 9: + return "Abort acked"; + case 10: + return "Abort nacked"; + case 16: + return "Bye"; + case 17: + return "Bye acked"; + case 18: + return "Bye nacked"; + case 32: + return "Hibernate request"; + case 33: + return "Hibernate ack"; + case 34: + return "Hibernate nack"; + case 64: + return "Image exists?"; + case 65: + return "Image does exist"; + case 66: + return "No image here"; + case 128: + return "I/O"; + case 129: + return "I/O okay"; + case 130: + return "I/O failed"; + case 256: + return "Running"; + default: + printk(KERN_ERR "Unrecognised message %d.\n", message); + return "Unrecognised message (see dmesg)"; + } +} + +#define MSG_ACK_MASK (MSG_ACK | MSG_NACK) +#define MSG_STATE_MASK (~MSG_ACK_MASK) + +struct node_info { + struct list_head member_list; + wait_queue_head_t member_events; + spinlock_t member_list_lock; + spinlock_t receive_lock; + int peer_count, ignored_peer_count; + struct toi_sysfs_data sysfs_data; + enum cluster_message current_message; +}; + +struct node_info node_array[MAX_LOCAL_NODES]; + +struct cluster_member { + __be32 addr; + enum cluster_message message; + struct list_head list; + int ignore; +}; + +#define toi_cluster_port_send 3501 +#define toi_cluster_port_recv 3502 + +static struct net_device *net_dev; +static struct toi_module_ops toi_cluster_ops; + +static int toi_recv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); + +static struct packet_type toi_cluster_packet_type = { + .type = __constant_htons(ETH_P_IP), + .func = toi_recv, +}; + +struct toi_pkt { /* BOOTP packet format */ + struct iphdr iph; /* IP header */ + struct udphdr udph; /* UDP header */ + u8 htype; /* HW address type */ + u8 hlen; /* HW address length */ + __be32 xid; /* Transaction ID */ + __be16 secs; /* Seconds since we started */ + __be16 flags; /* Just what it says */ + u8 hw_addr[16]; /* Sender's HW address */ + u16 message; /* Message */ + unsigned long sid; /* Source ID for loopback testing */ +}; + +static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE; + +static int added_pack; + +static int others_have_image; + +/* Key used to allow multiple clusters on the same lan */ +static char toi_cluster_key[32] = CONFIG_TOI_DEFAULT_CLUSTER_KEY; +static char pre_hibernate_script[255] = + CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE; +static char post_hibernate_script[255] = + CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE; + +/* List of cluster members */ +static unsigned long continue_delay = 5 * HZ; +static unsigned long cluster_message_timeout = 3 * HZ; + +/* === Membership list === */ + +static void print_member_info(int index) +{ + struct cluster_member *this; + + printk(KERN_INFO "==> Dumping node %d.\n", index); + + list_for_each_entry(this, &node_array[index].member_list, list) + printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n", + NIPQUAD(this->addr), + str_message(this->message), + this->ignore ? 
"(Ignored)" : ""); + printk(KERN_INFO "== Done ==\n"); +} + +static struct cluster_member *__find_member(int index, __be32 addr) +{ + struct cluster_member *this; + + list_for_each_entry(this, &node_array[index].member_list, list) { + if (this->addr != addr) + continue; + + return this; + } + + return NULL; +} + +static void set_ignore(int index, __be32 addr, struct cluster_member *this) +{ + if (this->ignore) { + PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", + index, NIPQUAD(addr)); + return; + } + + PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", + index, NIPQUAD(addr)); + this->ignore = 1; + node_array[index].ignored_peer_count++; +} + +static int __add_update_member(int index, __be32 addr, int message) +{ + struct cluster_member *this; + + this = __find_member(index, addr); + if (this) { + if (this->message != message) { + this->message = message; + if ((message & MSG_NACK) && + (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) + set_ignore(index, addr, this); + PRINTK("Node %d sees node %d.%d.%d.%d now sending " + "%s.\n", index, NIPQUAD(addr), + str_message(message)); + wake_up(&node_array[index].member_events); + } + return 0; + } + + this = (struct cluster_member *) toi_kzalloc(36, + sizeof(struct cluster_member), GFP_KERNEL); + + if (!this) + return -1; + + this->addr = addr; + this->message = message; + this->ignore = 0; + INIT_LIST_HEAD(&this->list); + + node_array[index].peer_count++; + + PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index, + NIPQUAD(addr), str_message(message)); + + if ((message & MSG_NACK) && + (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) + set_ignore(index, addr, this); + list_add_tail(&this->list, &node_array[index].member_list); + return 1; +} + +static int add_update_member(int index, __be32 addr, int message) +{ + int result; + unsigned long flags; + spin_lock_irqsave(&node_array[index].member_list_lock, flags); + result = __add_update_member(index, addr, message); + spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); + + print_member_info(index); + + wake_up(&node_array[index].member_events); + + return result; +} + +static void del_member(int index, __be32 addr) +{ + struct cluster_member *this; + unsigned long flags; + + spin_lock_irqsave(&node_array[index].member_list_lock, flags); + this = __find_member(index, addr); + + if (this) { + list_del_init(&this->list); + toi_kfree(36, this, sizeof(*this)); + node_array[index].peer_count--; + } + + spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); +} + +/* === Message transmission === */ + +static void toi_send_if(int message, unsigned long my_id); + +/* + * Process received TOI packet. + */ +static int toi_recv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct toi_pkt *b; + struct iphdr *h; + int len, result, index; + unsigned long addr, message, ack; + + /* Perform verifications before taking the lock. 
*/ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + if (dev != net_dev) + goto drop; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_RX_DROP; + + if (!pskb_may_pull(skb, + sizeof(struct iphdr) + + sizeof(struct udphdr))) + goto drop; + + b = (struct toi_pkt *)skb_network_header(skb); + h = &b->iph; + + if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) + goto drop; + + /* Fragments are not supported */ + if (h->frag_off & htons(IP_OFFSET | IP_MF)) { + if (net_ratelimit()) + printk(KERN_ERR "TuxOnIce: Ignoring fragmented " + "cluster message.\n"); + goto drop; + } + + if (skb->len < ntohs(h->tot_len)) + goto drop; + + if (ip_fast_csum((char *) h, h->ihl)) + goto drop; + + if (b->udph.source != htons(toi_cluster_port_send) || + b->udph.dest != htons(toi_cluster_port_recv)) + goto drop; + + if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) + goto drop; + + len = ntohs(b->udph.len) - sizeof(struct udphdr); + + /* Ok the front looks good, make sure we can get at the rest. */ + if (!pskb_may_pull(skb, skb->len)) + goto drop; + + b = (struct toi_pkt *)skb_network_header(skb); + h = &b->iph; + + addr = SADDR; + PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n", + str_message(b->message), NIPQUAD(addr)); + + message = b->message & MSG_STATE_MASK; + ack = b->message & MSG_ACK_MASK; + + for (index = 0; index < num_local_nodes; index++) { + int new_message = node_array[index].current_message, + old_message = new_message; + + if (index == SADDR || !old_message) { + PRINTK("Ignoring node %d (offline or self).\n", index); + continue; + } + + /* One message at a time, please. */ + spin_lock(&node_array[index].receive_lock); + + result = add_update_member(index, SADDR, b->message); + if (result == -1) { + printk(KERN_INFO "Failed to add new cluster member " + NIPQUAD_FMT ".\n", + NIPQUAD(addr)); + goto drop_unlock; + } + + switch (b->message & MSG_STATE_MASK) { + case MSG_PING: + break; + case MSG_ABORT: + break; + case MSG_BYE: + break; + case MSG_HIBERNATE: + /* Can I hibernate? */ + new_message = MSG_HIBERNATE | + ((index & 1) ? MSG_NACK : MSG_ACK); + break; + case MSG_IMAGE: + /* Can I resume? */ + new_message = MSG_IMAGE | + ((index & 1) ? MSG_NACK : MSG_ACK); + if (new_message != old_message) + printk(KERN_ERR "Setting whether I can resume " + "to %d.\n", new_message); + break; + case MSG_IO: + new_message = MSG_IO | MSG_ACK; + break; + case MSG_RUNNING: + break; + default: + if (net_ratelimit()) + printk(KERN_ERR "Unrecognised TuxOnIce cluster" + " message %d from " NIPQUAD_FMT ".\n", + b->message, NIPQUAD(addr)); + }; + + if (old_message != new_message) { + node_array[index].current_message = new_message; + printk(KERN_INFO ">>> Sending new message for node " + "%d.\n", index); + toi_send_if(new_message, index); + } else if (!ack) { + printk(KERN_INFO ">>> Resending message for node %d.\n", + index); + toi_send_if(new_message, index); + } +drop_unlock: + spin_unlock(&node_array[index].receive_lock); + }; + +drop: + /* Throw the packet out. */ + kfree_skb(skb); + + return 0; +} + +/* + * Send cluster message to single interface. 
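+ *
+ * The on-wire format is the BOOTP-like struct toi_pkt defined above: an
+ * IPv4 header addressed to INADDR_BROADCAST, a UDP header from port 3501
+ * (toi_cluster_port_send) to port 3502 (toi_cluster_port_recv), the
+ * sender's hardware address, the message word (state bits plus an
+ * optional MSG_ACK/MSG_NACK, e.g. MSG_IMAGE | MSG_ACK) and the sender id
+ * used in loopback testing.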
+ */ +static void toi_send_if(int message, unsigned long my_id) +{ + struct sk_buff *skb; + struct toi_pkt *b; + int hh_len = LL_RESERVED_SPACE(net_dev); + struct iphdr *h; + + /* Allocate packet */ + skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL); + if (!skb) + return; + skb_reserve(skb, hh_len); + b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt)); + memset(b, 0, sizeof(struct toi_pkt)); + + /* Construct IP header */ + skb_reset_network_header(skb); + h = ip_hdr(skb); + h->version = 4; + h->ihl = 5; + h->tot_len = htons(sizeof(struct toi_pkt)); + h->frag_off = htons(IP_DF); + h->ttl = 64; + h->protocol = IPPROTO_UDP; + h->daddr = htonl(INADDR_BROADCAST); + h->check = ip_fast_csum((unsigned char *) h, h->ihl); + + /* Construct UDP header */ + b->udph.source = htons(toi_cluster_port_send); + b->udph.dest = htons(toi_cluster_port_recv); + b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr)); + /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ + + /* Construct message */ + b->message = message; + b->sid = my_id; + b->htype = net_dev->type; /* can cause undefined behavior */ + b->hlen = net_dev->addr_len; + memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len); + b->secs = htons(3); /* 3 seconds */ + + /* Chain packet down the line... */ + skb->dev = net_dev; + skb->protocol = htons(ETH_P_IP); + if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol), + net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) || + dev_queue_xmit(skb) < 0) + printk(KERN_INFO "E"); +} + +/* ========================================= */ + +/* kTOICluster */ + +static atomic_t num_cluster_threads; +static DECLARE_WAIT_QUEUE_HEAD(clusterd_events); + +static int kTOICluster(void *data) +{ + unsigned long my_id; + + my_id = atomic_add_return(1, &num_cluster_threads) - 1; + node_array[my_id].current_message = (unsigned long) data; + + PRINTK("kTOICluster daemon %lu starting.\n", my_id); + + current->flags |= PF_NOFREEZE; + + while (node_array[my_id].current_message) { + toi_send_if(node_array[my_id].current_message, my_id); + sleep_on_timeout(&clusterd_events, + cluster_message_timeout); + PRINTK("Link state %lu is %d.\n", my_id, + node_array[my_id].current_message); + } + + toi_send_if(MSG_BYE, my_id); + atomic_dec(&num_cluster_threads); + wake_up(&clusterd_events); + + PRINTK("kTOICluster daemon %lu exiting.\n", my_id); + __set_current_state(TASK_RUNNING); + return 0; +} + +static void kill_clusterd(void) +{ + int i; + + for (i = 0; i < num_local_nodes; i++) { + if (node_array[i].current_message) { + PRINTK("Seeking to kill clusterd %d.\n", i); + node_array[i].current_message = 0; + } + } + wait_event(clusterd_events, + !atomic_read(&num_cluster_threads)); + PRINTK("All cluster daemons have exited.\n"); +} + +static int peers_not_in_message(int index, int message, int precise) +{ + struct cluster_member *this; + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&node_array[index].member_list_lock, flags); + list_for_each_entry(this, &node_array[index].member_list, list) { + if (this->ignore) + continue; + + PRINTK("Peer %d.%d.%d.%d sending %s. " + "Seeking %s.\n", + NIPQUAD(this->addr), + str_message(this->message), str_message(message)); + if ((precise ? 
this->message : + this->message & MSG_STATE_MASK) != + message) + result++; + } + spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); + PRINTK("%d peers in sought message.\n", result); + return result; +} + +static void reset_ignored(int index) +{ + struct cluster_member *this; + unsigned long flags; + + spin_lock_irqsave(&node_array[index].member_list_lock, flags); + list_for_each_entry(this, &node_array[index].member_list, list) + this->ignore = 0; + node_array[index].ignored_peer_count = 0; + spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); +} + +static int peers_in_message(int index, int message, int precise) +{ + return node_array[index].peer_count - + node_array[index].ignored_peer_count - + peers_not_in_message(index, message, precise); +} + +static int time_to_continue(int index, unsigned long start, int message) +{ + int first = peers_not_in_message(index, message, 0); + int second = peers_in_message(index, message, 1); + + PRINTK("First part returns %d, second returns %d.\n", first, second); + + if (!first && !second) { + PRINTK("All peers answered message %d.\n", + message); + return 1; + } + + if (time_after(jiffies, start + continue_delay)) { + PRINTK("Timeout reached.\n"); + return 1; + } + + PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies, + start + continue_delay); + return 0; +} + +void toi_initiate_cluster_hibernate(void) +{ + int result; + unsigned long start; + + result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); + if (result) + return; + + toi_send_if(MSG_HIBERNATE, 0); + + start = jiffies; + wait_event(node_array[0].member_events, + time_to_continue(0, start, MSG_HIBERNATE)); + + if (test_action_state(TOI_FREEZER_TEST)) { + toi_send_if(MSG_ABORT, 0); + + start = jiffies; + wait_event(node_array[0].member_events, + time_to_continue(0, start, MSG_RUNNING)); + + do_toi_step(STEP_QUIET_CLEANUP); + return; + } + + toi_send_if(MSG_IO, 0); + + result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); + if (result) + return; + + /* This code runs at resume time too! */ + if (toi_in_hibernate) + result = do_toi_step(STEP_HIBERNATE_POWERDOWN); +} + +/* toi_cluster_print_debug_stats + * + * Description: Print information to be recorded for debugging purposes into a + * buffer. + * Arguments: buffer: Pointer to a buffer into which the debug info will be + * printed. + * size: Size of the buffer. + * Returns: Number of characters written to the buffer. + */ +static int toi_cluster_print_debug_stats(char *buffer, int size) +{ + int len; + + if (strlen(toi_cluster_iface)) + len = scnprintf(buffer, size, + "- Cluster interface is '%s'.\n", + toi_cluster_iface); + else + len = scnprintf(buffer, size, + "- Cluster support is disabled.\n"); + return len; +} + +/* cluster_memory_needed + * + * Description: Tell the caller how much memory we need to operate during + * hibernate/resume. + * Returns: Unsigned long. Maximum number of bytes of memory required for + * operation. + */ +static int toi_cluster_memory_needed(void) +{ + return 0; +} + +static int toi_cluster_storage_needed(void) +{ + return 1 + strlen(toi_cluster_iface); +} + +/* toi_cluster_save_config_info + * + * Description: Save informaton needed when reloading the image at resume time. + * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE. + * Returns: Number of bytes used for saving our data. 
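+ *
+ * Purely as an illustration: with toi_cluster_iface set to "eth0", the
+ * buffer receives the five bytes 'e' 't' 'h' '0' '\0' and 5 is returned,
+ * matching what toi_cluster_storage_needed() reserves above.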
+ */
+static int toi_cluster_save_config_info(char *buffer)
+{
+        strcpy(buffer, toi_cluster_iface);
+        return strlen(toi_cluster_iface) + 1;
+}
+
+/* toi_cluster_load_config_info
+ *
+ * Description: Reload information needed for declustering the image at
+ *              resume time.
+ * Arguments:   Buffer: Pointer to the start of the data.
+ *              Size: Number of bytes that were saved.
+ */
+static void toi_cluster_load_config_info(char *buffer, int size)
+{
+        strncpy(toi_cluster_iface, buffer, size);
+        return;
+}
+
+static void cluster_startup(void)
+{
+        int have_image = do_check_can_resume(), i;
+        unsigned long start = jiffies, initial_message;
+        struct task_struct *p;
+
+        initial_message = MSG_IMAGE;
+
+        have_image = 1;
+
+        for (i = 0; i < num_local_nodes; i++) {
+                PRINTK("Starting ktoiclusterd %d.\n", i);
+                p = kthread_create(kTOICluster, (void *) initial_message,
+                                "ktoiclusterd/%d", i);
+                if (IS_ERR(p)) {
+                        printk(KERN_ERR "Failed to start ktoiclusterd.\n");
+                        return;
+                }
+
+                wake_up_process(p);
+        }
+
+        /* Wait for delay or someone else sending first message */
+        wait_event(node_array[0].member_events, time_to_continue(0, start,
+                                MSG_IMAGE));
+
+        others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1);
+
+        printk(KERN_INFO "Continuing. I %shave an image. Peers with image:"
+                " %d.\n", have_image ? "" : "don't ", others_have_image);
+
+        if (have_image) {
+                int result;
+
+                /* Start to resume */
+                printk(KERN_INFO " === Starting to resume === \n");
+                node_array[0].current_message = MSG_IO;
+                toi_send_if(MSG_IO, 0);
+
+                /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */
+                result = 0;
+
+                if (!result) {
+                        /*
+                         * Atomic restore - we'll come back in the hibernation
+                         * path.
+                         */
+
+                        /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */
+                        result = 0;
+
+                        /* do_toi_step(STEP_QUIET_CLEANUP); */
+                }
+
+                node_array[0].current_message |= MSG_NACK;
+
+                /* For debugging - disable for real life? */
+                wait_event(node_array[0].member_events,
+                                time_to_continue(0, start, MSG_IO));
+        }
+
+        if (others_have_image) {
+                /* Wait for them to resume */
+                printk(KERN_INFO "Waiting for other nodes to resume.\n");
+                start = jiffies;
+                wait_event(node_array[0].member_events,
+                                time_to_continue(0, start, MSG_RUNNING));
+                if (peers_not_in_message(0, MSG_RUNNING, 0))
+                        printk(KERN_INFO "Timed out while waiting for other "
+                                "nodes to resume.\n");
+        }
+
+        /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE
+         * as appropriate.
+         *
+         * If we don't have an image:
+         * - Wait until someone else says they have one, or conditions are met
+         *   for continuing to boot (n machines or t seconds).
+         * - If anyone has an image, wait for them to resume before continuing
+         *   to boot.
+         *
+         * If we have an image:
+         * - Wait until conditions are met before continuing to resume (n
+         *   machines or t seconds). Send RESUME_PREP and freeze processes.
+         *   NACK_PREP if freezing fails (shouldn't) and follow logic for
+         *   us having no image above. On success, wait for [N]ACK_PREP from
+         *   other machines. Read image (including atomic restore) until done.
+         *   Wait for ACK_READ from others (should never fail). Thaw processes
+         *   and do post-resume. (The section after the atomic restore is done
+         *   via the code for hibernating).
+         */
+
+        node_array[0].current_message = MSG_RUNNING;
+}
+
+/* toi_cluster_open_iface
+ *
+ * Description: Prepare to use an interface.
+ */ + +static int toi_cluster_open_iface(void) +{ + struct net_device *dev; + + rtnl_lock(); + + for_each_netdev(&init_net, dev) { + if (/* dev == &init_net.loopback_dev || */ + strcmp(dev->name, toi_cluster_iface)) + continue; + + net_dev = dev; + break; + } + + rtnl_unlock(); + + if (!net_dev) { + printk(KERN_ERR MYNAME ": Device %s not found.\n", + toi_cluster_iface); + return -ENODEV; + } + + dev_add_pack(&toi_cluster_packet_type); + added_pack = 1; + + loopback_mode = (net_dev == init_net.loopback_dev); + num_local_nodes = loopback_mode ? 8 : 1; + + PRINTK("Loopback mode is %s. Number of local nodes is %d.\n", + loopback_mode ? "on" : "off", num_local_nodes); + + cluster_startup(); + return 0; +} + +/* toi_cluster_close_iface + * + * Description: Stop using an interface. + */ + +static int toi_cluster_close_iface(void) +{ + kill_clusterd(); + if (added_pack) { + dev_remove_pack(&toi_cluster_packet_type); + added_pack = 0; + } + return 0; +} + +static void write_side_effect(void) +{ + if (toi_cluster_ops.enabled) { + toi_cluster_open_iface(); + set_toi_state(TOI_CLUSTER_MODE); + } else { + toi_cluster_close_iface(); + clear_toi_state(TOI_CLUSTER_MODE); + } +} + +static void node_write_side_effect(void) +{ +} + +/* + * data for our sysfs entries. + */ +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0, + NULL), + SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0, + write_side_effect), + SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL), + SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script, + 256, 0, NULL), + SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script, + 256, 0, STRING), + SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ, + 0) +}; + +/* + * Ops structure. 
+ */ + +static struct toi_module_ops toi_cluster_ops = { + .type = FILTER_MODULE, + .name = "Cluster", + .directory = "cluster", + .module = THIS_MODULE, + .memory_needed = toi_cluster_memory_needed, + .print_debug_info = toi_cluster_print_debug_stats, + .save_config_info = toi_cluster_save_config_info, + .load_config_info = toi_cluster_load_config_info, + .storage_needed = toi_cluster_storage_needed, + + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +/* ---- Registration ---- */ + +#ifdef MODULE +#define INIT static __init +#define EXIT static __exit +#else +#define INIT +#define EXIT +#endif + +INIT int toi_cluster_init(void) +{ + int temp = toi_register_module(&toi_cluster_ops), i; + struct kobject *kobj = toi_cluster_ops.dir_kobj; + + for (i = 0; i < MAX_LOCAL_NODES; i++) { + node_array[i].current_message = 0; + INIT_LIST_HEAD(&node_array[i].member_list); + init_waitqueue_head(&node_array[i].member_events); + spin_lock_init(&node_array[i].member_list_lock); + spin_lock_init(&node_array[i].receive_lock); + + /* Set up sysfs entry */ + node_array[i].sysfs_data.attr.name = toi_kzalloc(8, + sizeof(node_array[i].sysfs_data.attr.name), + GFP_KERNEL); + sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d", + i); + node_array[i].sysfs_data.attr.mode = SYSFS_RW; + node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER; + node_array[i].sysfs_data.flags = 0; + node_array[i].sysfs_data.data.integer.variable = + (int *) &node_array[i].current_message; + node_array[i].sysfs_data.data.integer.minimum = 0; + node_array[i].sysfs_data.data.integer.maximum = INT_MAX; + node_array[i].sysfs_data.write_side_effect = + node_write_side_effect; + toi_register_sysfs_file(kobj, &node_array[i].sysfs_data); + } + + toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0); + + if (toi_cluster_ops.enabled) + toi_cluster_open_iface(); + + return temp; +} + +EXIT void toi_cluster_exit(void) +{ + int i; + toi_cluster_close_iface(); + + for (i = 0; i < MAX_LOCAL_NODES; i++) + toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj, + &node_array[i].sysfs_data); + toi_unregister_module(&toi_cluster_ops); +} + +static int __init toi_cluster_iface_setup(char *iface) +{ + toi_cluster_ops.enabled = (*iface && + strcmp(iface, "off")); + + if (toi_cluster_ops.enabled) + strncpy(toi_cluster_iface, iface, strlen(iface)); +} + +__setup("toi_cluster=", toi_cluster_iface_setup); diff -Nur linux-4.2/kernel/power/tuxonice_cluster.h linux-4.2-pck/kernel/power/tuxonice_cluster.h --- linux-4.2/kernel/power/tuxonice_cluster.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_cluster.h 2015-09-08 11:20:32.467011976 -0300 @@ -0,0 +1,18 @@ +/* + * kernel/power/tuxonice_cluster.h + * + * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. 
+ */ + +#ifdef CONFIG_TOI_CLUSTER +extern int toi_cluster_init(void); +extern void toi_cluster_exit(void); +extern void toi_initiate_cluster_hibernate(void); +#else +static inline int toi_cluster_init(void) { return 0; } +static inline void toi_cluster_exit(void) { } +static inline void toi_initiate_cluster_hibernate(void) { } +#endif + diff -Nur linux-4.2/kernel/power/tuxonice_compress.c linux-4.2-pck/kernel/power/tuxonice_compress.c --- linux-4.2/kernel/power/tuxonice_compress.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_compress.c 2015-09-08 11:20:32.467011976 -0300 @@ -0,0 +1,452 @@ +/* + * kernel/power/compression.c + * + * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * This file contains data compression routines for TuxOnIce, + * using cryptoapi. + */ + +#include +#include +#include +#include + +#include "tuxonice_builtin.h" +#include "tuxonice.h" +#include "tuxonice_modules.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_io.h" +#include "tuxonice_ui.h" +#include "tuxonice_alloc.h" + +static int toi_expected_compression; + +static struct toi_module_ops toi_compression_ops; +static struct toi_module_ops *next_driver; + +static char toi_compressor_name[32] = "lzo"; + +static DEFINE_MUTEX(stats_lock); + +struct cpu_context { + u8 *page_buffer; + struct crypto_comp *transform; + unsigned int len; + u8 *buffer_start; + u8 *output_buffer; +}; + +#define OUT_BUF_SIZE (2 * PAGE_SIZE) + +static DEFINE_PER_CPU(struct cpu_context, contexts); + +/* + * toi_crypto_prepare + * + * Prepare to do some work by allocating buffers and transforms. + */ +static int toi_compress_crypto_prepare(void) +{ + int cpu; + + if (!*toi_compressor_name) { + printk(KERN_INFO "TuxOnIce: Compression enabled but no " + "compressor name set.\n"); + return 1; + } + + for_each_online_cpu(cpu) { + struct cpu_context *this = &per_cpu(contexts, cpu); + this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0); + if (IS_ERR(this->transform)) { + printk(KERN_INFO "TuxOnIce: Failed to initialise the " + "%s compression transform.\n", + toi_compressor_name); + this->transform = NULL; + return 1; + } + + this->page_buffer = + (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP); + + if (!this->page_buffer) { + printk(KERN_ERR + "Failed to allocate a page buffer for TuxOnIce " + "compression driver.\n"); + return -ENOMEM; + } + + this->output_buffer = + (char *) vmalloc_32(OUT_BUF_SIZE); + + if (!this->output_buffer) { + printk(KERN_ERR + "Failed to allocate a output buffer for TuxOnIce " + "compression driver.\n"); + return -ENOMEM; + } + } + + return 0; +} + +static int toi_compress_rw_cleanup(int writing) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct cpu_context *this = &per_cpu(contexts, cpu); + if (this->transform) { + crypto_free_comp(this->transform); + this->transform = NULL; + } + + if (this->page_buffer) + toi_free_page(16, (unsigned long) this->page_buffer); + + this->page_buffer = NULL; + + if (this->output_buffer) + vfree(this->output_buffer); + + this->output_buffer = NULL; + } + + return 0; +} + +/* + * toi_compress_init + */ + +static int toi_compress_init(int toi_or_resume) +{ + if (!toi_or_resume) + return 0; + + toi_compress_bytes_in = 0; + toi_compress_bytes_out = 0; + + next_driver = toi_get_next_filter(&toi_compression_ops); + + return next_driver ? 
0 : -ECHILD; +} + +/* + * toi_compress_rw_init() + */ + +static int toi_compress_rw_init(int rw, int stream_number) +{ + if (toi_compress_crypto_prepare()) { + printk(KERN_ERR "Failed to initialise compression " + "algorithm.\n"); + if (rw == READ) { + printk(KERN_INFO "Unable to read the image.\n"); + return -ENODEV; + } else { + printk(KERN_INFO "Continuing without " + "compressing the image.\n"); + toi_compression_ops.enabled = 0; + } + } + + return 0; +} + +/* + * toi_compress_write_page() + * + * Compress a page of data, buffering output and passing on filled + * pages to the next module in the pipeline. + * + * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing + * data to be compressed. + * + * Returns: 0 on success. Otherwise the error is that returned by later + * modules, -ECHILD if we have a broken pipeline or -EIO if + * zlib errs. + */ +static int toi_compress_write_page(unsigned long index, int buf_type, + void *buffer_page, unsigned int buf_size) +{ + int ret = 0, cpu = smp_processor_id(); + struct cpu_context *ctx = &per_cpu(contexts, cpu); + u8* output_buffer = buffer_page; + int output_len = buf_size; + int out_buf_type = buf_type; + + if (ctx->transform) { + + ctx->buffer_start = TOI_MAP(buf_type, buffer_page); + ctx->len = OUT_BUF_SIZE; + + ret = crypto_comp_compress(ctx->transform, + ctx->buffer_start, buf_size, + ctx->output_buffer, &ctx->len); + + TOI_UNMAP(buf_type, buffer_page); + + toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, + "CPU %d, index %lu: %d bytes", + cpu, index, ctx->len); + + if (!ret && ctx->len < buf_size) { /* some compression */ + output_buffer = ctx->output_buffer; + output_len = ctx->len; + out_buf_type = TOI_VIRT; + } + + } + + mutex_lock(&stats_lock); + + toi_compress_bytes_in += buf_size; + toi_compress_bytes_out += output_len; + + mutex_unlock(&stats_lock); + + if (!ret) + ret = next_driver->write_page(index, out_buf_type, + output_buffer, output_len); + + return ret; +} + +/* + * toi_compress_read_page() + * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. + * + * Retrieve data from later modules and decompress it until the input buffer + * is filled. + * Zero if successful. Error condition from me or from downstream on failure. + */ +static int toi_compress_read_page(unsigned long *index, int buf_type, + void *buffer_page, unsigned int *buf_size) +{ + int ret, cpu = smp_processor_id(); + unsigned int len; + unsigned int outlen = PAGE_SIZE; + char *buffer_start; + struct cpu_context *ctx = &per_cpu(contexts, cpu); + + if (!ctx->transform) + return next_driver->read_page(index, TOI_PAGE, buffer_page, + buf_size); + + /* + * All our reads must be synchronous - we can't decompress + * data that hasn't been read yet. 
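+ * We also need the reported length before we can tell whether this
+ * block was stored compressed (len < PAGE_SIZE, so it must go through
+ * crypto_comp_decompress()) or was stored raw (len == PAGE_SIZE, in
+ * which case it is simply copied out below).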
+ */ + + ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len); + + buffer_start = kmap(buffer_page); + + /* Error or uncompressed data */ + if (ret || len == PAGE_SIZE) { + memcpy(buffer_start, ctx->page_buffer, len); + goto out; + } + + ret = crypto_comp_decompress( + ctx->transform, + ctx->page_buffer, + len, buffer_start, &outlen); + + toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, + "CPU %d, index %lu: %d=>%d (%d).", + cpu, *index, len, outlen, ret); + + if (ret) + abort_hibernate(TOI_FAILED_IO, + "Compress_read returned %d.\n", ret); + else if (outlen != PAGE_SIZE) { + abort_hibernate(TOI_FAILED_IO, + "Decompression yielded %d bytes instead of %ld.\n", + outlen, PAGE_SIZE); + printk(KERN_ERR "Decompression yielded %d bytes instead of " + "%ld.\n", outlen, PAGE_SIZE); + ret = -EIO; + *buf_size = outlen; + } +out: + TOI_UNMAP(buf_type, buffer_page); + return ret; +} + +/* + * toi_compress_print_debug_stats + * @buffer: Pointer to a buffer into which the debug info will be printed. + * @size: Size of the buffer. + * + * Print information to be recorded for debugging purposes into a buffer. + * Returns: Number of characters written to the buffer. + */ + +static int toi_compress_print_debug_stats(char *buffer, int size) +{ + unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT, + pages_out = toi_compress_bytes_out >> PAGE_SHIFT; + int len; + + /* Output the compression ratio achieved. */ + if (*toi_compressor_name) + len = scnprintf(buffer, size, "- Compressor is '%s'.\n", + toi_compressor_name); + else + len = scnprintf(buffer, size, "- Compressor is not set.\n"); + + if (pages_in) + len += scnprintf(buffer+len, size - len, " Compressed " + "%lu bytes into %lu (%ld percent compression).\n", + toi_compress_bytes_in, + toi_compress_bytes_out, + (pages_in - pages_out) * 100 / pages_in); + return len; +} + +/* + * toi_compress_compression_memory_needed + * + * Tell the caller how much memory we need to operate during hibernate/resume. + * Returns: Unsigned long. Maximum number of bytes of memory required for + * operation. + */ +static int toi_compress_memory_needed(void) +{ + return 2 * PAGE_SIZE; +} + +static int toi_compress_storage_needed(void) +{ + return 2 * sizeof(unsigned long) + 2 * sizeof(int) + + strlen(toi_compressor_name) + 1; +} + +/* + * toi_compress_save_config_info + * @buffer: Pointer to a buffer of size PAGE_SIZE. + * + * Save informaton needed when reloading the image at resume time. + * Returns: Number of bytes used for saving our data. + */ +static int toi_compress_save_config_info(char *buffer) +{ + int len = strlen(toi_compressor_name) + 1, offset = 0; + + *((unsigned long *) buffer) = toi_compress_bytes_in; + offset += sizeof(unsigned long); + *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out; + offset += sizeof(unsigned long); + *((int *) (buffer + offset)) = toi_expected_compression; + offset += sizeof(int); + *((int *) (buffer + offset)) = len; + offset += sizeof(int); + strncpy(buffer + offset, toi_compressor_name, len); + return offset + len; +} + +/* toi_compress_load_config_info + * @buffer: Pointer to the start of the data. + * @size: Number of bytes that were saved. + * + * Description: Reload information needed for decompressing the image at + * resume time. 
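+ *
+ * The layout written by toi_compress_save_config_info() above is, in
+ * order: toi_compress_bytes_in (unsigned long), toi_compress_bytes_out
+ * (unsigned long), toi_expected_compression (int), the length of the
+ * compressor name including its trailing NUL (int), then that many name
+ * bytes - e.g. "lzo" is stored with a length field of 4.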
+ */ +static void toi_compress_load_config_info(char *buffer, int size) +{ + int len, offset = 0; + + toi_compress_bytes_in = *((unsigned long *) buffer); + offset += sizeof(unsigned long); + toi_compress_bytes_out = *((unsigned long *) (buffer + offset)); + offset += sizeof(unsigned long); + toi_expected_compression = *((int *) (buffer + offset)); + offset += sizeof(int); + len = *((int *) (buffer + offset)); + offset += sizeof(int); + strncpy(toi_compressor_name, buffer + offset, len); +} + +static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd) +{ + bkd->compress_bytes_in = toi_compress_bytes_in; + bkd->compress_bytes_out = toi_compress_bytes_out; +} + +static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd) +{ + toi_compress_bytes_in = bkd->compress_bytes_in; + toi_compress_bytes_out = bkd->compress_bytes_out; +} + +/* + * toi_expected_compression_ratio + * + * Description: Returns the expected ratio between data passed into this module + * and the amount of data output when writing. + * Returns: 100 if the module is disabled. Otherwise the value set by the + * user via our sysfs entry. + */ + +static int toi_compress_expected_ratio(void) +{ + if (!toi_compression_ops.enabled) + return 100; + else + return 100 - toi_expected_compression; +} + +/* + * data for our sysfs entries. + */ +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression, + 0, 99, 0, NULL), + SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0, + NULL), + SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL), +}; + +/* + * Ops structure. + */ +static struct toi_module_ops toi_compression_ops = { + .type = FILTER_MODULE, + .name = "compression", + .directory = "compression", + .module = THIS_MODULE, + .initialise = toi_compress_init, + .memory_needed = toi_compress_memory_needed, + .print_debug_info = toi_compress_print_debug_stats, + .save_config_info = toi_compress_save_config_info, + .load_config_info = toi_compress_load_config_info, + .storage_needed = toi_compress_storage_needed, + .expected_compression = toi_compress_expected_ratio, + + .pre_atomic_restore = toi_compress_pre_atomic_restore, + .post_atomic_restore = toi_compress_post_atomic_restore, + + .rw_init = toi_compress_rw_init, + .rw_cleanup = toi_compress_rw_cleanup, + + .write_page = toi_compress_write_page, + .read_page = toi_compress_read_page, + + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +/* ---- Registration ---- */ + +static __init int toi_compress_load(void) +{ + return toi_register_module(&toi_compression_ops); +} + +late_initcall(toi_compress_load); diff -Nur linux-4.2/kernel/power/tuxonice_copy_before_write.c linux-4.2-pck/kernel/power/tuxonice_copy_before_write.c --- linux-4.2/kernel/power/tuxonice_copy_before_write.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_copy_before_write.c 2015-09-08 11:20:32.467011976 -0300 @@ -0,0 +1,240 @@ +/* + * kernel/power/tuxonice_copy_before_write.c + * + * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * Routines (apart from the fault handling code) to deal with allocating memory + * for copying pages before they are modified, restoring the contents and getting + * the contents written to disk. 
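+ *
+ * Roughly (the page-fault side lives elsewhere): each CPU keeps a
+ * toi_cbw_state with a pre-allocated chain of struct toi_cbw entries,
+ * each owning a spare page (->virt). When a protected page is about to
+ * be written to, its pfn and original contents are recorded in the next
+ * free entry, so the image can still be given the unmodified data while
+ * the running system carries on using the modified page.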
+ */ + +#include +#include +#include +#include "tuxonice_alloc.h" +#include "tuxonice_modules.h" +#include "tuxonice_sysfs.h" +#include "tuxonice.h" + +DEFINE_PER_CPU(struct toi_cbw_state, toi_cbw_states); +#define CBWS_PER_PAGE (PAGE_SIZE / sizeof(struct toi_cbw)) +#define toi_cbw_pool_size 100 + +static void _toi_free_cbw_data(struct toi_cbw_state *state) +{ + struct toi_cbw *page_ptr, *ptr, *next; + + page_ptr = ptr = state->first; + + while(ptr) { + next = ptr->next; + + if (ptr->virt) { + toi__free_page(40, virt_to_page(ptr->virt)); + } + if ((((unsigned long) ptr) & PAGE_MASK) != (unsigned long) page_ptr) { + /* Must be on a new page - free the previous one. */ + toi__free_page(40, virt_to_page(page_ptr)); + page_ptr = ptr; + } + ptr = next; + } + + if (page_ptr) { + toi__free_page(40, virt_to_page(page_ptr)); + } + + state->first = state->next = state->last = NULL; + state->size = 0; +} + +void toi_free_cbw_data(void) +{ + int i; + + for_each_online_cpu(i) { + struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); + + if (!state->first) + continue; + + state->enabled = 0; + + while (state->active) { + schedule(); + } + + _toi_free_cbw_data(state); + } +} + +static int _toi_allocate_cbw_data(struct toi_cbw_state *state) +{ + while(state->size < toi_cbw_pool_size) { + int i; + struct toi_cbw *ptr; + + ptr = (struct toi_cbw *) toi_get_zeroed_page(40, GFP_KERNEL); + + if (!ptr) { + return -ENOMEM; + } + + if (!state->first) { + state->first = state->next = state->last = ptr; + } + + for (i = 0; i < CBWS_PER_PAGE; i++) { + struct toi_cbw *cbw = &ptr[i]; + + cbw->virt = (char *) toi_get_zeroed_page(40, GFP_KERNEL); + if (!cbw->virt) { + state->size += i; + printk("Out of memory allocating CBW pages.\n"); + return -ENOMEM; + } + + if (cbw == state->first) + continue; + + state->last->next = cbw; + state->last = cbw; + } + + state->size += CBWS_PER_PAGE; + } + + state->enabled = 1; + + return 0; +} + + +int toi_allocate_cbw_data(void) +{ + int i, result; + + for_each_online_cpu(i) { + struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); + + result = _toi_allocate_cbw_data(state); + + if (result) + return result; + } + + return 0; +} + +void toi_cbw_restore(void) +{ + if (!toi_keeping_image) + return; + +} + +void toi_cbw_write(void) +{ + if (!toi_keeping_image) + return; + +} + +/** + * toi_cbw_test_read - Test copy before write on one page + * + * Allocate copy before write buffers, then make one page only copy-before-write + * and attempt to write to it. We should then be able to retrieve the original + * version from the cbw buffer and the modified version from the page itself. 
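+ *
+ * Given the sysfs registration at the bottom of this file, the test is
+ * presumably run by reading the "test" entry of the "cbw" directory
+ * (i.e. something like /sys/power/tuxonice/cbw/test); the buffer handed
+ * back to userspace carries the pass/fail report assembled below.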
+ */ +static int toi_cbw_test_read(const char *buffer, int count) +{ + unsigned long virt = toi_get_zeroed_page(40, GFP_KERNEL); + char *original = "Original contents"; + char *modified = "Modified material"; + struct page *page = virt_to_page(virt); + int i, len = 0, found = 0, pfn = page_to_pfn(page); + + if (!page) { + printk("toi_cbw_test_read: Unable to allocate a page for testing.\n"); + return -ENOMEM; + } + + memcpy((char *) virt, original, strlen(original)); + + if (toi_allocate_cbw_data()) { + printk("toi_cbw_test_read: Unable to allocate cbw data.\n"); + return -ENOMEM; + } + + toi_reset_dirtiness_one(pfn, 0); + + SetPageTOI_CBW(page); + + memcpy((char *) virt, modified, strlen(modified)); + + if (strncmp((char *) virt, modified, strlen(modified))) { + len += sprintf((char *) buffer + len, "Failed to write to page after protecting it.\n"); + } + + for_each_online_cpu(i) { + struct toi_cbw_state *state = &per_cpu(toi_cbw_states, i); + struct toi_cbw *ptr = state->first, *last_ptr = ptr; + + if (!found) { + while (ptr) { + if (ptr->pfn == pfn) { + found = 1; + if (strncmp(ptr->virt, original, strlen(original))) { + len += sprintf((char *) buffer + len, "Contents of original buffer are not original.\n"); + } else { + len += sprintf((char *) buffer + len, "Test passed. Buffer changed and original contents preserved.\n"); + } + break; + } + + last_ptr = ptr; + ptr = ptr->next; + } + } + + if (!last_ptr) + len += sprintf((char *) buffer + len, "All available CBW buffers on cpu %d used.\n", i); + } + + if (!found) + len += sprintf((char *) buffer + len, "Copy before write buffer not found.\n"); + + toi_free_cbw_data(); + + return len; +} + +/* + * This array contains entries that are automatically registered at + * boot. Modules and the console code register their own entries separately. + */ +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_CUSTOM("test", SYSFS_RW, toi_cbw_test_read, + NULL, SYSFS_NEEDS_SM_FOR_READ, NULL), +}; + +static struct toi_module_ops toi_cbw_ops = { + .type = MISC_HIDDEN_MODULE, + .name = "copy_before_write debugging", + .directory = "cbw", + .module = THIS_MODULE, + .early = 1, + + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +int toi_cbw_init(void) +{ + int result = toi_register_module(&toi_cbw_ops); + return result; +} diff -Nur linux-4.2/kernel/power/tuxonice_extent.c linux-4.2-pck/kernel/power/tuxonice_extent.c --- linux-4.2/kernel/power/tuxonice_extent.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_extent.c 2015-09-08 11:20:32.467011976 -0300 @@ -0,0 +1,144 @@ +/* + * kernel/power/tuxonice_extent.c + * + * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * Distributed under GPLv2. + * + * These functions encapsulate the manipulation of storage metadata. + */ + +#include +#include "tuxonice_modules.h" +#include "tuxonice_extent.h" +#include "tuxonice_alloc.h" +#include "tuxonice_ui.h" +#include "tuxonice.h" + +/** + * toi_get_extent - return a free extent + * + * May fail, returning NULL instead. + **/ +static struct hibernate_extent *toi_get_extent(void) +{ + return (struct hibernate_extent *) toi_kzalloc(2, + sizeof(struct hibernate_extent), TOI_ATOMIC_GFP); +} + +/** + * toi_put_extent_chain - free a chain of extents starting from value 'from' + * @chain: Chain to free. + * + * Note that 'from' is an extent value, and may be part way through an extent. 
+ * In this case, the extent should be truncated (if necessary) and following + * extents freed. + **/ +void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from) +{ + struct hibernate_extent *this; + + this = chain->first; + + while (this) { + struct hibernate_extent *next = this->next; + + // Delete the whole extent? + if (this->start >= from) { + chain->size -= (this->end - this->start + 1); + if (chain->first == this) + chain->first = next; + if (chain->last_touched == this) + chain->last_touched = NULL; + if (chain->current_extent == this) + chain->current_extent = NULL; + toi_kfree(2, this, sizeof(*this)); + chain->num_extents--; + } else if (this->end >= from) { + // Delete part of the extent + chain->size -= (this->end - from + 1); + this->start = from; + } + this = next; + } +} + +/** + * toi_put_extent_chain - free a whole chain of extents + * @chain: Chain to free. + **/ +void toi_put_extent_chain(struct hibernate_extent_chain *chain) +{ + toi_put_extent_chain_from(chain, 0); +} + +/** + * toi_add_to_extent_chain - add an extent to an existing chain + * @chain: Chain to which the extend should be added + * @start: Start of the extent (first physical block) + * @end: End of the extent (last physical block) + * + * The chain information is updated if the insertion is successful. + **/ +int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, + unsigned long start, unsigned long end) +{ + struct hibernate_extent *new_ext = NULL, *cur_ext = NULL; + + toi_message(TOI_IO, TOI_VERBOSE, 0, + "Adding extent %lu-%lu to chain %p.\n", start, end, chain); + + /* Find the right place in the chain */ + if (chain->last_touched && chain->last_touched->start < start) + cur_ext = chain->last_touched; + else if (chain->first && chain->first->start < start) + cur_ext = chain->first; + + if (cur_ext) { + while (cur_ext->next && cur_ext->next->start < start) + cur_ext = cur_ext->next; + + if (cur_ext->end == (start - 1)) { + struct hibernate_extent *next_ext = cur_ext->next; + cur_ext->end = end; + + /* Merge with the following one? */ + if (next_ext && cur_ext->end + 1 == next_ext->start) { + cur_ext->end = next_ext->end; + cur_ext->next = next_ext->next; + toi_kfree(2, next_ext, sizeof(*next_ext)); + chain->num_extents--; + } + + chain->last_touched = cur_ext; + chain->size += (end - start + 1); + + return 0; + } + } + + new_ext = toi_get_extent(); + if (!new_ext) { + printk(KERN_INFO "Error unable to append a new extent to the " + "chain.\n"); + return -ENOMEM; + } + + chain->num_extents++; + chain->size += (end - start + 1); + new_ext->start = start; + new_ext->end = end; + + chain->last_touched = new_ext; + + if (cur_ext) { + new_ext->next = cur_ext->next; + cur_ext->next = new_ext; + } else { + if (chain->first) + new_ext->next = chain->first; + chain->first = new_ext; + } + + return 0; +} diff -Nur linux-4.2/kernel/power/tuxonice_extent.h linux-4.2-pck/kernel/power/tuxonice_extent.h --- linux-4.2/kernel/power/tuxonice_extent.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_extent.h 2015-09-08 11:20:32.467011976 -0300 @@ -0,0 +1,45 @@ +/* + * kernel/power/tuxonice_extent.h + * + * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * It contains declarations related to extents. Extents are + * TuxOnIce's method of storing some of the metadata for the image. + * See tuxonice_extent.c for more info. 
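+ *
+ * A small worked example of how a chain behaves (see
+ * toi_add_to_extent_chain() in tuxonice_extent.c): starting from an
+ * empty chain, adding blocks 10-14 and then 15-19 merges them into a
+ * single extent covering 10-19 (num_extents == 1, size == 10); adding
+ * 30-34 afterwards appends a second extent, and toi_extent_for_each()
+ * below would then visit 10..19 followed by 30..34.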
+ * + */ + +#include "tuxonice_modules.h" + +#ifndef EXTENT_H +#define EXTENT_H + +struct hibernate_extent { + unsigned long start, end; + struct hibernate_extent *next; +}; + +struct hibernate_extent_chain { + unsigned long size; /* size of the chain ie sum (max-min+1) */ + int num_extents; + struct hibernate_extent *first, *last_touched; + struct hibernate_extent *current_extent; + unsigned long current_offset; +}; + +/* Simplify iterating through all the values in an extent chain */ +#define toi_extent_for_each(extent_chain, extentpointer, value) \ +if ((extent_chain)->first) \ + for ((extentpointer) = (extent_chain)->first, (value) = \ + (extentpointer)->start; \ + ((extentpointer) && ((extentpointer)->next || (value) <= \ + (extentpointer)->end)); \ + (((value) == (extentpointer)->end) ? \ + ((extentpointer) = (extentpointer)->next, (value) = \ + ((extentpointer) ? (extentpointer)->start : 0)) : \ + (value)++)) + +extern void toi_put_extent_chain_from(struct hibernate_extent_chain *chain, unsigned long from); +#endif diff -Nur linux-4.2/kernel/power/tuxonice_file.c linux-4.2-pck/kernel/power/tuxonice_file.c --- linux-4.2/kernel/power/tuxonice_file.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_file.c 2015-09-08 11:20:32.467011976 -0300 @@ -0,0 +1,484 @@ +/* + * kernel/power/tuxonice_file.c + * + * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * Distributed under GPLv2. + * + * This file encapsulates functions for usage of a simple file as a + * backing store. It is based upon the swapallocator, and shares the + * same basic working. Here, though, we have nothing to do with + * swapspace, and only one device to worry about. + * + * The user can just + * + * echo TuxOnIce > /path/to/my_file + * + * dd if=/dev/zero bs=1M count= >> /path/to/my_file + * + * and + * + * echo /path/to/my_file > /sys/power/tuxonice/file/target + * + * then put what they find in /sys/power/tuxonice/resume + * as their resume= parameter in lilo.conf (and rerun lilo if using it). + * + * Having done this, they're ready to hibernate and resume. + * + * TODO: + * - File resizing. 
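Continuing the sketch above, the toi_extent_for_each() macro from tuxonice_extent.h visits every value stored in a chain, stepping into the next extent automatically; here chain is assumed to point at a populated struct hibernate_extent_chain:

	struct hibernate_extent *ext;
	unsigned long block;

	toi_extent_for_each(chain, ext, block)
		pr_debug("visiting block %lu (extent %lu-%lu)\n",
			 block, ext->start, ext->end);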
+ */ + +#include +#include +#include +#include + +#include "tuxonice.h" +#include "tuxonice_modules.h" +#include "tuxonice_bio.h" +#include "tuxonice_alloc.h" +#include "tuxonice_builtin.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_ui.h" +#include "tuxonice_io.h" + +#define target_is_normal_file() (S_ISREG(target_inode->i_mode)) + +static struct toi_module_ops toi_fileops; + +static struct file *target_file; +static struct block_device *toi_file_target_bdev; +static unsigned long pages_available, pages_allocated; +static char toi_file_target[256]; +static struct inode *target_inode; +static int file_target_priority; +static int used_devt; +static int target_claim; +static dev_t toi_file_dev_t; +static int sig_page_index; + +/* For test_toi_file_target */ +static struct toi_bdev_info *file_chain; + +static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num) +{ + int j; + sector_t last = 0; + + for (j = 0; j < dev_info->blocks_per_page; j++) { + sector_t this = bmap(target_inode, + page_num * dev_info->blocks_per_page + j); + + if (!this || (last && (last + 1) != this)) + break; + + last = this; + } + + return j == dev_info->blocks_per_page; +} + +static unsigned long get_usable_pages(struct toi_bdev_info *dev_info) +{ + unsigned long result = 0; + struct block_device *bdev = dev_info->bdev; + int i; + + switch (target_inode->i_mode & S_IFMT) { + case S_IFSOCK: + case S_IFCHR: + case S_IFIFO: /* Socket, Char, Fifo */ + return -1; + case S_IFREG: /* Regular file: current size - holes + free + space on part */ + for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) { + if (has_contiguous_blocks(dev_info, i)) + result++; + } + break; + case S_IFBLK: /* Block device */ + if (!bdev->bd_disk) { + toi_message(TOI_IO, TOI_VERBOSE, 0, + "bdev->bd_disk null."); + return 0; + } + + result = (bdev->bd_part ? + bdev->bd_part->nr_sects : + get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9); + } + + + return result; +} + +static int toi_file_register_storage(void) +{ + struct toi_bdev_info *devinfo; + int result = 0; + struct fs_info *fs_info; + + toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage."); + if (!strlen(toi_file_target)) { + toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: " + "No target filename set."); + return 0; + } + + target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0); + toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.", + toi_file_target, target_file); + + if (IS_ERR(target_file) || !target_file) { + target_file = NULL; + toi_file_dev_t = name_to_dev_t(toi_file_target); + if (!toi_file_dev_t) { + struct kstat stat; + int error = vfs_stat(toi_file_target, &stat); + printk(KERN_INFO "Open file %s returned %p and " + "name_to_devt failed.\n", + toi_file_target, target_file); + if (error) { + printk(KERN_INFO "Stating the file also failed." 
+ " Nothing more we can do.\n"); + return 0; + } else + toi_file_dev_t = stat.rdev; + } + + toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t); + if (IS_ERR(toi_file_target_bdev)) { + printk(KERN_INFO "Got a dev_num (%lx) but failed to " + "open it.\n", + (unsigned long) toi_file_dev_t); + toi_file_target_bdev = NULL; + return 0; + } + used_devt = 1; + target_inode = toi_file_target_bdev->bd_inode; + } else + target_inode = target_file->f_mapping->host; + + toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target."); + if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) || + S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) { + printk(KERN_INFO "File support works with regular files," + " character files and block devices.\n"); + /* Cleanup routine will undo the above */ + return 0; + } + + if (!used_devt) { + if (S_ISBLK(target_inode->i_mode)) { + toi_file_target_bdev = I_BDEV(target_inode); + if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE | + FMODE_READ, NULL)) + target_claim = 1; + } else + toi_file_target_bdev = target_inode->i_sb->s_bdev; + if (!toi_file_target_bdev) { + printk(KERN_INFO "%s is not a valid file allocator " + "target.\n", toi_file_target); + return 0; + } + toi_file_dev_t = toi_file_target_bdev->bd_dev; + } + + devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC); + if (!devinfo) { + printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n"); + return -ENOMEM; + } + + devinfo->bdev = toi_file_target_bdev; + devinfo->allocator = &toi_fileops; + devinfo->allocator_index = 0; + + fs_info = fs_info_from_block_dev(toi_file_target_bdev); + if (fs_info && !IS_ERR(fs_info)) { + memcpy(devinfo->uuid, &fs_info->uuid, 16); + free_fs_info(fs_info); + } else + result = (int) PTR_ERR(fs_info); + + /* Unlike swap code, only complain if fs_info_from_block_dev returned + * -ENOMEM. The 'file' might be a full partition, so might validly not + * have an identifiable type, UUID etc. + */ + if (result) + printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n", + result); + devinfo->dev_t = toi_file_dev_t; + devinfo->prio = file_target_priority; + devinfo->bmap_shift = target_inode->i_blkbits - 9; + devinfo->blocks_per_page = + (1 << (PAGE_SHIFT - target_inode->i_blkbits)); + sprintf(devinfo->name, "file %s", toi_file_target); + file_chain = devinfo; + toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap " + "shift is %d. Blocks per page %d.", + devinfo->dev_t, devinfo->prio, devinfo->bmap_shift, + devinfo->blocks_per_page); + + /* Keep one aside for the signature */ + pages_available = get_usable_pages(devinfo) - 1; + + toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu " + "pages.", pages_available); + + toi_bio_ops.register_storage(devinfo); + return 0; +} + +static unsigned long toi_file_storage_available(void) +{ + return pages_available; +} + +static int toi_file_allocate_storage(struct toi_bdev_info *chain, + unsigned long request) +{ + unsigned long available = pages_available - pages_allocated; + unsigned long to_add = min(available, request); + + toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated " + "is %lu. 
Allocating %lu pages from file.", + pages_available, pages_allocated, to_add); + pages_allocated += to_add; + + return to_add; +} + +/** + * __populate_block_list - add an extent to the chain + * @min: Start of the extent (first physical block = sector) + * @max: End of the extent (last physical block = sector) + * + * If TOI_TEST_BIO is set, print a debug message, outputting the min and max + * fs block numbers. + **/ +static int __populate_block_list(struct toi_bdev_info *chain, int min, int max) +{ + if (test_action_state(TOI_TEST_BIO)) + toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.", + min << chain->bmap_shift, + ((max + 1) << chain->bmap_shift) - 1); + + return toi_add_to_extent_chain(&chain->blocks, min, max); +} + +static int get_main_pool_phys_params(struct toi_bdev_info *chain) +{ + int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0; + unsigned long pages_mapped = 0; + + toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks."); + + if (chain->blocks.first) + toi_put_extent_chain(&chain->blocks); + + if (!target_is_normal_file()) { + result = (pages_available > 0) ? + __populate_block_list(chain, chain->blocks_per_page, + (pages_allocated + 1) * + chain->blocks_per_page - 1) : 0; + return result; + } + + /* + * FIXME: We are assuming the first page is contiguous. Is that + * assumption always right? + */ + + for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) { + sector_t new_sector; + + if (!has_contiguous_blocks(chain, i)) + continue; + + if (!have_sig_page) { + have_sig_page = 1; + sig_page_index = i; + continue; + } + + pages_mapped++; + + /* Ignore first page - it has the header */ + if (pages_mapped == 1) + continue; + + new_sector = bmap(target_inode, (i * chain->blocks_per_page)); + + /* + * I'd love to be able to fill in holes and resize + * files, but not yet... + */ + + if (new_sector == extent_max + 1) + extent_max += chain->blocks_per_page; + else { + if (extent_min > -1) { + result = __populate_block_list(chain, + extent_min, extent_max); + if (result) + return result; + } + + extent_min = new_sector; + extent_max = extent_min + + chain->blocks_per_page - 1; + } + + if (pages_mapped == pages_allocated) + break; + } + + if (extent_min > -1) { + result = __populate_block_list(chain, extent_min, extent_max); + if (result) + return result; + } + + return 0; +} + +static void toi_file_free_storage(struct toi_bdev_info *chain) +{ + pages_allocated = 0; + file_chain = NULL; +} + +/** + * toi_file_print_debug_stats - print debug info + * @buffer: Buffer to data to populate + * @size: Size of the buffer + **/ +static int toi_file_print_debug_stats(char *buffer, int size) +{ + int len = scnprintf(buffer, size, "- File Allocator active.\n"); + + len += scnprintf(buffer+len, size-len, " Storage available for " + "image: %lu pages.\n", pages_available); + + return len; +} + +static void toi_file_cleanup(int finishing_cycle) +{ + if (toi_file_target_bdev) { + if (target_claim) { + blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ); + target_claim = 0; + } + + if (used_devt) { + blkdev_put(toi_file_target_bdev, + FMODE_READ | FMODE_NDELAY); + used_devt = 0; + } + toi_file_target_bdev = NULL; + target_inode = NULL; + } + + if (target_file) { + filp_close(target_file, NULL); + target_file = NULL; + } + + pages_available = 0; +} + +/** + * test_toi_file_target - sysfs callback for /sys/power/tuxonince/file/target + * + * Test wheter the target file is valid for hibernating. 
+ **/ +static void test_toi_file_target(void) +{ + int result = toi_file_register_storage(); + sector_t sector; + char buf[50]; + struct fs_info *fs_info; + + if (result || !file_chain) + return; + + /* This doesn't mean we're in business. Is any storage available? */ + if (!pages_available) + goto out; + + toi_file_allocate_storage(file_chain, 1); + result = get_main_pool_phys_params(file_chain); + if (result) + goto out; + + + sector = bmap(target_inode, sig_page_index * + file_chain->blocks_per_page) << file_chain->bmap_shift; + + /* Use the uuid, or the dev_t if that fails */ + fs_info = fs_info_from_block_dev(toi_file_target_bdev); + if (!fs_info || IS_ERR(fs_info)) { + bdevname(toi_file_target_bdev, buf); + sprintf(resume_file, "/dev/%s:%llu", buf, + (unsigned long long) sector); + } else { + int i; + hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0); + + /* Remove the spaces */ + for (i = 1; i < 16; i++) { + buf[2 * i] = buf[3 * i]; + buf[2 * i + 1] = buf[3 * i + 1]; + } + buf[32] = 0; + sprintf(resume_file, "UUID=%s:0x%llx", buf, + (unsigned long long) sector); + free_fs_info(fs_info); + } + + toi_attempt_to_parse_resume_device(0); +out: + toi_file_free_storage(file_chain); + toi_bio_ops.free_storage(); +} + +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256, + SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target), + SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL), + SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095, + 4096, 0, NULL), +}; + +static struct toi_bio_allocator_ops toi_bio_fileops = { + .register_storage = toi_file_register_storage, + .storage_available = toi_file_storage_available, + .allocate_storage = toi_file_allocate_storage, + .bmap = get_main_pool_phys_params, + .free_storage = toi_file_free_storage, +}; + +static struct toi_module_ops toi_fileops = { + .type = BIO_ALLOCATOR_MODULE, + .name = "file storage", + .directory = "file", + .module = THIS_MODULE, + .print_debug_info = toi_file_print_debug_stats, + .cleanup = toi_file_cleanup, + .bio_allocator_ops = &toi_bio_fileops, + + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +/* ---- Registration ---- */ +static __init int toi_file_load(void) +{ + return toi_register_module(&toi_fileops); +} + +late_initcall(toi_file_load); diff -Nur linux-4.2/kernel/power/tuxonice_highlevel.c linux-4.2-pck/kernel/power/tuxonice_highlevel.c --- linux-4.2/kernel/power/tuxonice_highlevel.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_highlevel.c 2015-09-08 11:20:32.467011976 -0300 @@ -0,0 +1,1413 @@ +/* + * kernel/power/tuxonice_highlevel.c + */ +/** \mainpage TuxOnIce. + * + * TuxOnIce provides support for saving and restoring an image of + * system memory to an arbitrary storage device, either on the local computer, + * or across some network. The support is entirely OS based, so TuxOnIce + * works without requiring BIOS, APM or ACPI support. The vast majority of the + * code is also architecture independant, so it should be very easy to port + * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem + * and preemption. Initramfses and initrds are also supported. 
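The file allocator that finishes just above is a complete instance of the modular design described in the header comment that follows: a module fills in a struct toi_module_ops and hands it to toi_register_module(). A reduced, hypothetical sketch of that pattern (names invented; fields as used by toi_fileops and toi_cbw_ops above):

	static int example_enabled;

	static struct toi_sysfs_data sysfs_params[] = {
		SYSFS_INT("enabled", SYSFS_RW, &example_enabled, 0, 1, 0, NULL),
	};

	static struct toi_module_ops example_ops = {
		.type = MISC_HIDDEN_MODULE,	/* as in the cbw debugging module */
		.name = "example module",
		.directory = "example",		/* appears under /sys/power/tuxonice/ */
		.module = THIS_MODULE,
		.sysfs_data = sysfs_params,
		.num_sysfs_entries = sizeof(sysfs_params) /
				     sizeof(struct toi_sysfs_data),
	};

	static __init int example_load(void)
	{
		return toi_register_module(&example_ops);
	}
	late_initcall(example_load);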
+ * + * TuxOnIce uses a modular design, in which the method of storing the image is + * completely abstracted from the core code, as are transformations on the data + * such as compression and/or encryption (multiple 'modules' can be used to + * provide arbitrary combinations of functionality). The user interface is also + * modular, so that arbitrarily simple or complex interfaces can be used to + * provide anything from debugging information through to eye candy. + * + * \section Copyright + * + * TuxOnIce is released under the GPLv2. + * + * Copyright (C) 1998-2001 Gabor Kuti
+ * Copyright (C) 1998,2001,2002 Pavel Machek
+ * Copyright (C) 2002-2003 Florent Chabaud
+ * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au)
+ * + * \section Credits + * + * Nigel would like to thank the following people for their work: + * + * Bernard Blackham
+ * Web page & Wiki administration, some coding. A person without whom + * TuxOnIce would not be where it is. + * + * Michael Frank
+ * Extensive testing and help with improving stability. I was constantly + * amazed by the quality and quantity of Michael's help. + * + * Pavel Machek
+ * Modifications, defectiveness pointing, being with Gabor at the very + * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and + * 2.5.17. Even though Pavel and I disagree on the direction suspend to + * disk should take, I appreciate the valuable work he did in helping Gabor + * get the concept working. + * + * ..and of course the myriads of TuxOnIce users who have helped diagnose + * and fix bugs, made suggestions on how to improve the code, proofread + * documentation, and donated time and money. + * + * Thanks also to corporate sponsors: + * + * Redhat.Sometime employer from May 2006 (my fault, not Redhat's!). + * + * Cyclades.com. Nigel's employers from Dec 2004 until May 2006, who + * allowed him to work on TuxOnIce and PM related issues on company time. + * + * LinuxFund.org. Sponsored Nigel's work on TuxOnIce for four months Oct + * 2003 to Jan 2004. + * + * LAC Linux. Donated P4 hardware that enabled development and ongoing + * maintenance of SMP and Highmem support. + * + * OSDL. Provided access to various hardware configurations, make + * occasional small donations to the project. + */ + +#include +#include +#include +#include +#include +#include +#include +#include /* for get/set_fs & KERNEL_DS on i386 */ +#include +#include + +#include "tuxonice.h" +#include "tuxonice_modules.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_prepare_image.h" +#include "tuxonice_io.h" +#include "tuxonice_ui.h" +#include "tuxonice_power_off.h" +#include "tuxonice_storage.h" +#include "tuxonice_checksum.h" +#include "tuxonice_builtin.h" +#include "tuxonice_atomic_copy.h" +#include "tuxonice_alloc.h" +#include "tuxonice_cluster.h" + +/*! Pageset metadata. */ +struct pagedir pagedir2 = {2}; + +static mm_segment_t oldfs; +static DEFINE_MUTEX(tuxonice_in_use); +static int block_dump_save; + +int toi_trace_index; + +/* Binary signature if an image is present */ +char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c"; + +unsigned long boot_kernel_data_buffer; + +static char *result_strings[] = { + "Hibernation was aborted", + "The user requested that we cancel the hibernation", + "No storage was available", + "Insufficient storage was available", + "Freezing filesystems and/or tasks failed", + "A pre-existing image was used", + "We would free memory, but image size limit doesn't allow this", + "Unable to free enough memory to hibernate", + "Unable to obtain the Power Management Semaphore", + "A device suspend/resume returned an error", + "A system device suspend/resume returned an error", + "The extra pages allowance is too small", + "We were unable to successfully prepare an image", + "TuxOnIce module initialisation failed", + "TuxOnIce module cleanup failed", + "I/O errors were encountered", + "Ran out of memory", + "An error was encountered while reading the image", + "Platform preparation failed", + "CPU Hotplugging failed", + "Architecture specific preparation failed", + "Pages needed resaving, but we were told to abort if this happens", + "We can't hibernate at the moment (invalid resume= or filewriter " + "target?)", + "A hibernation preparation notifier chain member cancelled the " + "hibernation", + "Pre-snapshot preparation failed", + "Pre-restore preparation failed", + "Failed to disable usermode helpers", + "Can't resume from alternate image", + "Header reservation too small", + "Device Power Management Preparation failed", +}; + +/** + * toi_finish_anything - cleanup after doing anything + * @hibernate_or_resume: Whether finishing a cycle or attempt at + * 
resuming. + * + * This is our basic clean-up routine, matching start_anything below. We + * call cleanup routines, drop module references and restore process fs and + * cpus allowed masks, together with the global block_dump variable's value. + **/ +void toi_finish_anything(int hibernate_or_resume) +{ + toi_running = 0; + toi_cleanup_modules(hibernate_or_resume); + toi_put_modules(); + if (hibernate_or_resume) { + block_dump = block_dump_save; + set_cpus_allowed_ptr(current, cpu_all_mask); + toi_alloc_print_debug_stats(); + atomic_inc(&snapshot_device_available); + unlock_system_sleep(); + } + + set_fs(oldfs); + mutex_unlock(&tuxonice_in_use); +} + +/** + * toi_start_anything - basic initialisation for TuxOnIce + * @toi_or_resume: Whether starting a cycle or attempt at resuming. + * + * Our basic initialisation routine. Take references on modules, use the + * kernel segment, recheck resume= if no active allocator is set, initialise + * modules, save and reset block_dump and ensure we're running on CPU0. + **/ +int toi_start_anything(int hibernate_or_resume) +{ + mutex_lock(&tuxonice_in_use); + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + toi_trace_index = 0; + + if (hibernate_or_resume) { + lock_system_sleep(); + + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) + goto snapshotdevice_unavailable; + } + + if (hibernate_or_resume == SYSFS_HIBERNATE) + toi_print_modules(); + + if (toi_get_modules()) { + printk(KERN_INFO "TuxOnIce: Get modules failed!\n"); + goto prehibernate_err; + } + + if (hibernate_or_resume) { + block_dump_save = block_dump; + block_dump = 0; + set_cpus_allowed_ptr(current, + cpumask_of(cpumask_first(cpu_online_mask))); + } + + if (toi_initialise_modules_early(hibernate_or_resume)) + goto early_init_err; + + if (!toiActiveAllocator) + toi_attempt_to_parse_resume_device(!hibernate_or_resume); + + if (!toi_initialise_modules_late(hibernate_or_resume)) { + toi_running = 1; /* For the swsusp code we use :< */ + return 0; + } + + toi_cleanup_modules(hibernate_or_resume); +early_init_err: + if (hibernate_or_resume) { + block_dump_save = block_dump; + set_cpus_allowed_ptr(current, cpu_all_mask); + } + toi_put_modules(); +prehibernate_err: + if (hibernate_or_resume) + atomic_inc(&snapshot_device_available); +snapshotdevice_unavailable: + if (hibernate_or_resume) + mutex_unlock(&pm_mutex); + set_fs(oldfs); + mutex_unlock(&tuxonice_in_use); + return -EBUSY; +} + +/* + * Nosave page tracking. + * + * Here rather than in prepare_image because we want to do it once only at the + * start of a cycle. + */ + +/** + * mark_nosave_pages - set up our Nosave bitmap + * + * Build a bitmap of Nosave pages from the list. The bitmap allows faster + * use when preparing the image. + **/ +static void mark_nosave_pages(void) +{ + struct nosave_region *region; + + list_for_each_entry(region, &nosave_regions, list) { + unsigned long pfn; + + for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) + if (pfn_valid(pfn)) { + SetPageNosave(pfn_to_page(pfn)); + } + } +} + +/** + * allocate_bitmaps - allocate bitmaps used to record page states + * + * Allocate the bitmaps we use to record the various TuxOnIce related + * page states. 
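toi_start_anything() and toi_finish_anything() above are strictly paired; every caller in this patch (for example toi_try_hibernate() and toi_sys_power_disk_try_resume() further down) follows the same shape. A minimal sketch of the calling convention:

	/* Sketch only: mirrors toi_sys_power_disk_try_resume() below. */
	static void example_resume_attempt(void)
	{
		if (toi_start_anything(SYSFS_RESUMING))
			return;		/* -EBUSY: TuxOnIce is already in use */

		/* ... locate the image, load it, restore ... */

		toi_finish_anything(SYSFS_RESUMING);
	}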
+ **/ +static int allocate_bitmaps(void) +{ + if (toi_alloc_bitmap(&pageset1_map) || + toi_alloc_bitmap(&pageset1_copy_map) || + toi_alloc_bitmap(&pageset2_map) || + toi_alloc_bitmap(&io_map) || + toi_alloc_bitmap(&nosave_map) || + toi_alloc_bitmap(&free_map) || + toi_alloc_bitmap(&compare_map) || + toi_alloc_bitmap(&page_resave_map)) + return 1; + + return 0; +} + +/** + * free_bitmaps - free the bitmaps used to record page states + * + * Free the bitmaps allocated above. It is not an error to call + * memory_bm_free on a bitmap that isn't currently allocated. + **/ +static void free_bitmaps(void) +{ + toi_free_bitmap(&pageset1_map); + toi_free_bitmap(&pageset1_copy_map); + toi_free_bitmap(&pageset2_map); + toi_free_bitmap(&io_map); + toi_free_bitmap(&nosave_map); + toi_free_bitmap(&free_map); + toi_free_bitmap(&compare_map); + toi_free_bitmap(&page_resave_map); +} + +/** + * io_MB_per_second - return the number of MB/s read or written + * @write: Whether to return the speed at which we wrote. + * + * Calculate the number of megabytes per second that were read or written. + **/ +static int io_MB_per_second(int write) +{ + return (toi_bkd.toi_io_time[write][1]) ? + MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ / + toi_bkd.toi_io_time[write][1] : 0; +} + +#define SNPRINTF(a...) do { len += scnprintf(((char *) buffer) + len, \ + count - len - 1, ## a); } while (0) + +/** + * get_debug_info - fill a buffer with debugging information + * @buffer: The buffer to be filled. + * @count: The size of the buffer, in bytes. + * + * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will + * either printk or return via sysfs. + **/ +static int get_toi_debug_info(const char *buffer, int count) +{ + int len = 0, i, first_result = 1; + + SNPRINTF("TuxOnIce debugging info:\n"); + SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n"); + SNPRINTF("- Kernel Version : " UTS_RELEASE "\n"); + SNPRINTF("- Compiler vers. : %d.%d\n", __GNUC__, __GNUC_MINOR__); + SNPRINTF("- Attempt number : %d\n", nr_hibernates); + SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n", + toi_result, + toi_bkd.toi_action, + toi_bkd.toi_debug_state, + toi_bkd.toi_default_console_level, + image_size_limit, + toi_poweroff_method); + SNPRINTF("- Overall expected compression percentage: %d.\n", + 100 - toi_expected_compression_ratio()); + len += toi_print_module_debug_info(((char *) buffer) + len, + count - len - 1); + if (toi_bkd.toi_io_time[0][1]) { + if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) { + SNPRINTF("- I/O speed: Write %ld KB/s", + (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / + toi_bkd.toi_io_time[0][1])); + if (toi_bkd.toi_io_time[1][1]) + SNPRINTF(", Read %ld KB/s", + (KB((unsigned long) + toi_bkd.toi_io_time[1][0]) * HZ / + toi_bkd.toi_io_time[1][1])); + } else { + SNPRINTF("- I/O speed: Write %ld MB/s", + (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / + toi_bkd.toi_io_time[0][1])); + if (toi_bkd.toi_io_time[1][1]) + SNPRINTF(", Read %ld MB/s", + (MB((unsigned long) + toi_bkd.toi_io_time[1][0]) * HZ / + toi_bkd.toi_io_time[1][1])); + } + SNPRINTF(".\n"); + } else + SNPRINTF("- No I/O speed stats available.\n"); + SNPRINTF("- Extra pages : %lu used/%lu.\n", + extra_pd1_pages_used, extra_pd1_pages_allowance); + + for (i = 0; i < TOI_NUM_RESULT_STATES; i++) + if (test_result_state(i)) { + SNPRINTF("%s: %s.\n", first_result ? + "- Result " : + " ", + result_strings[i]); + first_result = 0; + } + if (first_result) + SNPRINTF("- Result : %s.\n", nr_hibernates ? 
+ "Succeeded" : + "No hibernation attempts so far"); + return len; +} + +#ifdef CONFIG_TOI_INCREMENTAL +/** + * get_toi_page_state - fill a buffer with page state information + * @buffer: The buffer to be filled. + * @count: The size of the buffer, in bytes. + * + * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will + * either printk or return via sysfs. + **/ +static int get_toi_page_state(const char *buffer, int count) +{ + int free = 0, untracked = 0, dirty = 0, ro = 0, invalid = 0, other = 0, total = 0; + int len = 0; + struct zone *zone; + int allocated_bitmaps = 0; + + set_cpus_allowed_ptr(current, + cpumask_of(cpumask_first(cpu_online_mask))); + + if (!free_map) { + BUG_ON(toi_alloc_bitmap(&free_map)); + allocated_bitmaps = 1; + } + + toi_generate_free_page_map(); + + for_each_populated_zone(zone) { + unsigned long loop; + + total += zone->spanned_pages; + + for (loop = 0; loop < zone->spanned_pages; loop++) { + unsigned long pfn = zone->zone_start_pfn + loop; + struct page *page; + int chunk_size; + + if (!pfn_valid(pfn)) { + continue; + } + + chunk_size = toi_size_of_free_region(zone, pfn); + if (chunk_size) { + /* + * If the page gets allocated, it will be need + * saving in an image. + * Don't bother with explicitly removing any + * RO protection applied below. + * We'll SetPageTOI_Dirty(page) if/when it + * gets allocated. + */ + free += chunk_size; + loop += chunk_size - 1; + continue; + } + + page = pfn_to_page(pfn); + + if (PageTOI_Untracked(page)) { + untracked++; + } else if (PageTOI_RO(page)) { + ro++; + } else if (PageTOI_Dirty(page)) { + dirty++; + } else { + printk("Page %ld state 'other'.\n", pfn); + other++; + } + } + } + + if (allocated_bitmaps) { + toi_free_bitmap(&free_map); + } + + set_cpus_allowed_ptr(current, cpu_all_mask); + + SNPRINTF("TuxOnIce page breakdown:\n"); + SNPRINTF("- Free : %d\n", free); + SNPRINTF("- Untracked : %d\n", untracked); + SNPRINTF("- Read only : %d\n", ro); + SNPRINTF("- Dirty : %d\n", dirty); + SNPRINTF("- Other : %d\n", other); + SNPRINTF("- Invalid : %d\n", invalid); + SNPRINTF("- Total : %d\n", total); + return len; +} +#endif + +/** + * do_cleanup - cleanup after attempting to hibernate or resume + * @get_debug_info: Whether to allocate and return debugging info. + * + * Cleanup after attempting to hibernate or resume, possibly getting + * debugging info as we do so. 
+ **/ +static void do_cleanup(int get_debug_info, int restarting) +{ + int i = 0; + char *buffer = NULL; + + trap_non_toi_io = 0; + + if (get_debug_info) + toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up..."); + + free_checksum_pages(); + + toi_cbw_restore(); + toi_free_cbw_data(); + + if (get_debug_info) + buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP); + + if (buffer) + i = get_toi_debug_info(buffer, PAGE_SIZE); + + toi_free_extra_pagedir_memory(); + + pagedir1.size = 0; + pagedir2.size = 0; + set_highmem_size(pagedir1, 0); + set_highmem_size(pagedir2, 0); + + if (boot_kernel_data_buffer) { + if (!test_toi_state(TOI_BOOT_KERNEL)) + toi_free_page(37, boot_kernel_data_buffer); + boot_kernel_data_buffer = 0; + } + + if (test_toi_state(TOI_DEVICE_HOTPLUG_LOCKED)) { + unlock_device_hotplug(); + clear_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); + } + + clear_toi_state(TOI_BOOT_KERNEL); + if (current->flags & PF_SUSPEND_TASK) + thaw_processes(); + + if (!restarting) + toi_stop_other_threads(); + + if (toi_keeping_image && + !test_result_state(TOI_ABORTED)) { + toi_message(TOI_ANY_SECTION, TOI_LOW, 1, + "TuxOnIce: Not invalidating the image due " + "to Keep Image or Incremental Image being enabled."); + set_result_state(TOI_KEPT_IMAGE); + + /* + * For an incremental image, free unused storage so + * swap (if any) can be used for normal system operation, + * if so desired. + */ + + toiActiveAllocator->free_unused_storage(); + } else + if (toiActiveAllocator) + toiActiveAllocator->remove_image(); + + free_bitmaps(); + usermodehelper_enable(); + + if (test_toi_state(TOI_NOTIFIERS_PREPARE)) { + pm_notifier_call_chain(PM_POST_HIBERNATION); + clear_toi_state(TOI_NOTIFIERS_PREPARE); + } + + if (buffer && i) { + /* Printk can only handle 1023 bytes, including + * its level mangling. */ + for (i = 0; i < 3; i++) + printk(KERN_ERR "%s", buffer + (1023 * i)); + toi_free_page(20, (unsigned long) buffer); + } + + if (!restarting) + toi_cleanup_console(); + + free_attention_list(); + + if (!restarting) + toi_deactivate_storage(0); + + clear_toi_state(TOI_IGNORE_LOGLEVEL); + clear_toi_state(TOI_TRYING_TO_RESUME); + clear_toi_state(TOI_NOW_RESUMING); +} + +/** + * check_still_keeping_image - we kept an image; check whether to reuse it. + * + * We enter this routine when we have kept an image. If the user has said they + * want to still keep it, all we need to do is powerdown. If powering down + * means hibernating to ram and the power doesn't run out, we'll return 1. + * If we do power off properly or the battery runs out, we'll resume via the + * normal paths. + * + * If the user has said they want to remove the previously kept image, we + * remove it, and return 0. We'll then store a new image. + **/ +static int check_still_keeping_image(void) +{ + if (toi_keeping_image) { + if (!test_action_state(TOI_INCREMENTAL_IMAGE)) { + printk(KERN_INFO "Image already stored: powering down " + "immediately."); + do_toi_step(STEP_HIBERNATE_POWERDOWN); + return 1; + } + /** + * Incremental image - need to write new part. + * We detect that we're writing an incremental image by looking + * at test_result_state(TOI_KEPT_IMAGE) + **/ + return 0; + } + + printk(KERN_INFO "Invalidating previous image.\n"); + toiActiveAllocator->remove_image(); + + return 0; +} + +/** + * toi_init - prepare to hibernate to disk + * + * Initialise variables & data structures, in preparation for + * hibernating to disk. 
+ **/ +static int toi_init(int restarting) +{ + int result, i, j; + + toi_result = 0; + + printk(KERN_INFO "Initiating a hibernation cycle.\n"); + + nr_hibernates++; + + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + toi_bkd.toi_io_time[i][j] = 0; + + if (!test_toi_state(TOI_CAN_HIBERNATE) || + allocate_bitmaps()) + return 1; + + mark_nosave_pages(); + + if (!restarting) + toi_prepare_console(); + + result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (result) { + set_result_state(TOI_NOTIFIERS_PREPARE_FAILED); + return 1; + } + set_toi_state(TOI_NOTIFIERS_PREPARE); + + if (!restarting) { + printk(KERN_ERR "Starting other threads."); + toi_start_other_threads(); + } + + result = usermodehelper_disable(); + if (result) { + printk(KERN_ERR "TuxOnIce: Failed to disable usermode " + "helpers\n"); + set_result_state(TOI_USERMODE_HELPERS_ERR); + return 1; + } + + boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP); + if (!boot_kernel_data_buffer) { + printk(KERN_ERR "TuxOnIce: Failed to allocate " + "boot_kernel_data_buffer.\n"); + set_result_state(TOI_OUT_OF_MEMORY); + return 1; + } + + toi_allocate_cbw_data(); + + return 0; +} + +/** + * can_hibernate - perform basic 'Can we hibernate?' tests + * + * Perform basic tests that must pass if we're going to be able to hibernate: + * Can we get the pm_mutex? Is resume= valid (we need to know where to write + * the image header). + **/ +static int can_hibernate(void) +{ + if (!test_toi_state(TOI_CAN_HIBERNATE)) + toi_attempt_to_parse_resume_device(0); + + if (!test_toi_state(TOI_CAN_HIBERNATE)) { + printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n" + "This may be because you haven't put something along " + "the lines of\n\nresume=swap:/dev/hda1\n\n" + "in lilo.conf or equivalent. (Where /dev/hda1 is your " + "swap partition).\n"); + set_abort_result(TOI_CANT_SUSPEND); + return 0; + } + + if (strlen(alt_resume_param)) { + attempt_to_parse_alt_resume_param(); + + if (!strlen(alt_resume_param)) { + printk(KERN_INFO "Alternate resume parameter now " + "invalid. Aborting.\n"); + set_abort_result(TOI_CANT_USE_ALT_RESUME); + return 0; + } + } + + return 1; +} + +/** + * do_post_image_write - having written an image, figure out what to do next + * + * After writing an image, we might load an alternate image or power down. + * Powering down might involve hibernating to ram, in which case we also + * need to handle reloading pageset2. + **/ +static int do_post_image_write(void) +{ + /* If switching images fails, do normal powerdown */ + if (alt_resume_param[0]) + do_toi_step(STEP_RESUME_ALT_IMAGE); + + toi_power_down(); + + barrier(); + mb(); + return 0; +} + +/** + * __save_image - do the hard work of saving the image + * + * High level routine for getting the image saved. The key assumptions made + * are that processes have been frozen and sufficient memory is available. + * + * We also exit through here at resume time, coming back from toi_hibernate + * after the atomic restore. This is the reason for the toi_in_hibernate + * test. 
+ **/ +static int __save_image(void) +{ + int temp_result, did_copy = 0; + + toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image.."); + + toi_message(TOI_ANY_SECTION, TOI_LOW, 1, + " - Final values: %d and %d.", + pagedir1.size, pagedir2.size); + + toi_cond_pause(1, "About to write pagedir2."); + + temp_result = write_pageset(&pagedir2); + + if (temp_result == -1 || test_result_state(TOI_ABORTED)) + return 1; + + toi_cond_pause(1, "About to copy pageset 1."); + + if (test_result_state(TOI_ABORTED)) + return 1; + + toi_deactivate_storage(1); + + toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); + + toi_in_hibernate = 1; + + if (toi_go_atomic(PMSG_FREEZE, 1)) + goto Failed; + + temp_result = toi_hibernate(); + +#ifdef CONFIG_KGDB + if (test_action_state(TOI_POST_RESUME_BREAKPOINT)) + kgdb_breakpoint(); +#endif + + if (!temp_result) + did_copy = 1; + + /* We return here at resume time too! */ + toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result); + +Failed: + if (toi_activate_storage(1)) + panic("Failed to reactivate our storage."); + + /* Resume time? */ + if (!toi_in_hibernate) { + copyback_post(); + return 0; + } + + /* Nope. Hibernating. So, see if we can save the image... */ + + if (temp_result || test_result_state(TOI_ABORTED)) { + if (did_copy) + goto abort_reloading_pagedir_two; + else + return 1; + } + + toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size, + NULL); + + if (test_result_state(TOI_ABORTED)) + goto abort_reloading_pagedir_two; + + toi_cond_pause(1, "About to write pageset1."); + + toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1"); + + temp_result = write_pageset(&pagedir1); + + /* We didn't overwrite any memory, so no reread needs to be done. */ + if (test_action_state(TOI_TEST_FILTER_SPEED) || + test_action_state(TOI_TEST_BIO)) + return 1; + + if (temp_result == 1 || test_result_state(TOI_ABORTED)) + goto abort_reloading_pagedir_two; + + toi_cond_pause(1, "About to write header."); + + if (test_result_state(TOI_ABORTED)) + goto abort_reloading_pagedir_two; + + temp_result = write_image_header(); + + if (!temp_result && !test_result_state(TOI_ABORTED)) + return 0; + +abort_reloading_pagedir_two: + temp_result = read_pageset2(1); + + /* If that failed, we're sunk. Panic! */ + if (temp_result) + panic("Attempt to reload pagedir 2 while aborting " + "a hibernate failed."); + + return 1; +} + +static void map_ps2_pages(int enable) +{ + unsigned long pfn = 0; + + memory_bm_position_reset(pageset2_map); + pfn = memory_bm_next_pfn(pageset2_map, 0); + + while (pfn != BM_END_OF_MAP) { + struct page *page = pfn_to_page(pfn); + kernel_map_pages(page, 1, enable); + pfn = memory_bm_next_pfn(pageset2_map, 0); + } +} + +/** + * do_save_image - save the image and handle the result + * + * Save the prepared image. If we fail or we're in the path returning + * from the atomic restore, cleanup. + **/ +static int do_save_image(void) +{ + int result; + map_ps2_pages(0); + result = __save_image(); + map_ps2_pages(1); + return result; +} + +/** + * do_prepare_image - try to prepare an image + * + * Seek to initialise and prepare an image to be saved. On failure, + * cleanup. 
+ **/ +static int do_prepare_image(void) +{ + int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); + + if (!restarting && toi_activate_storage(0)) + return 1; + + /* + * If kept image and still keeping image and hibernating to RAM, (non + * incremental image case) we will return 1 after hibernating and + * resuming (provided the power doesn't run out. In that case, we skip + * directly to cleaning up and exiting. + */ + + if (!can_hibernate() || + (test_result_state(TOI_KEPT_IMAGE) && + check_still_keeping_image())) + return 1; + + if (toi_init(restarting) || toi_prepare_image() || + test_result_state(TOI_ABORTED)) + return 1; + + trap_non_toi_io = 1; + + return 0; +} + +/** + * do_check_can_resume - find out whether an image has been stored + * + * Read whether an image exists. We use the same routine as the + * image_exists sysfs entry, and just look to see whether the + * first character in the resulting buffer is a '1'. + **/ +int do_check_can_resume(void) +{ + int result = -1; + + if (toi_activate_storage(0)) + return -1; + + if (!test_toi_state(TOI_RESUME_DEVICE_OK)) + toi_attempt_to_parse_resume_device(1); + + if (toiActiveAllocator) + result = toiActiveAllocator->image_exists(1); + + toi_deactivate_storage(0); + return result; +} + +/** + * do_load_atomic_copy - load the first part of an image, if it exists + * + * Check whether we have an image. If one exists, do sanity checking + * (possibly invalidating the image or even rebooting if the user + * requests that) before loading it into memory in preparation for the + * atomic restore. + * + * If and only if we have an image loaded and ready to restore, we return 1. + **/ +static int do_load_atomic_copy(void) +{ + int read_image_result = 0; + + if (sizeof(swp_entry_t) != sizeof(long)) { + printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size" + " of long. Please report this!\n"); + return 1; + } + + if (!resume_file[0]) + printk(KERN_WARNING "TuxOnIce: " + "You need to use a resume= command line parameter to " + "tell TuxOnIce where to look for an image.\n"); + + toi_activate_storage(0); + + if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) && + !toi_attempt_to_parse_resume_device(0)) { + /* + * Without a usable storage device we can do nothing - + * even if noresume is given + */ + + if (!toiNumAllocators) + printk(KERN_ALERT "TuxOnIce: " + "No storage allocators have been registered.\n"); + else + printk(KERN_ALERT "TuxOnIce: " + "Missing or invalid storage location " + "(resume= parameter). Please correct and " + "rerun lilo (or equivalent) before " + "hibernating.\n"); + toi_deactivate_storage(0); + return 1; + } + + if (allocate_bitmaps()) + return 1; + + read_image_result = read_pageset1(); /* non fatal error ignored */ + + if (test_toi_state(TOI_NORESUME_SPECIFIED)) + clear_toi_state(TOI_NORESUME_SPECIFIED); + + toi_deactivate_storage(0); + + if (read_image_result) + return 1; + + return 0; +} + +/** + * prepare_restore_load_alt_image - save & restore alt image variables + * + * Save and restore the pageset1 maps, when loading an alternate image. 
+ **/ +static void prepare_restore_load_alt_image(int prepare) +{ + static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save; + + if (prepare) { + pageset1_map_save = pageset1_map; + pageset1_map = NULL; + pageset1_copy_map_save = pageset1_copy_map; + pageset1_copy_map = NULL; + set_toi_state(TOI_LOADING_ALT_IMAGE); + toi_reset_alt_image_pageset2_pfn(); + } else { + toi_free_bitmap(&pageset1_map); + pageset1_map = pageset1_map_save; + toi_free_bitmap(&pageset1_copy_map); + pageset1_copy_map = pageset1_copy_map_save; + clear_toi_state(TOI_NOW_RESUMING); + clear_toi_state(TOI_LOADING_ALT_IMAGE); + } +} + +/** + * do_toi_step - perform a step in hibernating or resuming + * + * Perform a step in hibernating or resuming an image. This abstraction + * is in preparation for implementing cluster support, and perhaps replacing + * uswsusp too (haven't looked whether that's possible yet). + **/ +int do_toi_step(int step) +{ + switch (step) { + case STEP_HIBERNATE_PREPARE_IMAGE: + return do_prepare_image(); + case STEP_HIBERNATE_SAVE_IMAGE: + return do_save_image(); + case STEP_HIBERNATE_POWERDOWN: + return do_post_image_write(); + case STEP_RESUME_CAN_RESUME: + return do_check_can_resume(); + case STEP_RESUME_LOAD_PS1: + return do_load_atomic_copy(); + case STEP_RESUME_DO_RESTORE: + /* + * If we succeed, this doesn't return. + * Instead, we return from do_save_image() in the + * hibernated kernel. + */ + return toi_atomic_restore(); + case STEP_RESUME_ALT_IMAGE: + printk(KERN_INFO "Trying to resume alternate image.\n"); + toi_in_hibernate = 0; + save_restore_alt_param(SAVE, NOQUIET); + prepare_restore_load_alt_image(1); + if (!do_check_can_resume()) { + printk(KERN_INFO "Nothing to resume from.\n"); + goto out; + } + if (!do_load_atomic_copy()) + toi_atomic_restore(); + + printk(KERN_INFO "Failed to load image.\n"); +out: + prepare_restore_load_alt_image(0); + save_restore_alt_param(RESTORE, NOQUIET); + break; + case STEP_CLEANUP: + do_cleanup(1, 0); + break; + case STEP_QUIET_CLEANUP: + do_cleanup(0, 0); + break; + } + + return 0; +} + +/* -- Functions for kickstarting a hibernate or resume --- */ + +/** + * toi_try_resume - try to do the steps in resuming + * + * Check if we have an image and if so try to resume. Clear the status + * flags too. + **/ +void toi_try_resume(void) +{ + set_toi_state(TOI_TRYING_TO_RESUME); + resume_attempted = 1; + + current->flags |= PF_MEMALLOC; + toi_start_other_threads(); + + if (do_toi_step(STEP_RESUME_CAN_RESUME) && + !do_toi_step(STEP_RESUME_LOAD_PS1)) + do_toi_step(STEP_RESUME_DO_RESTORE); + + toi_stop_other_threads(); + do_cleanup(0, 0); + + current->flags &= ~PF_MEMALLOC; + + clear_toi_state(TOI_IGNORE_LOGLEVEL); + clear_toi_state(TOI_TRYING_TO_RESUME); + clear_toi_state(TOI_NOW_RESUMING); +} + +/** + * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume + * + * Wrapper for when __toi_try_resume is called from swsusp resume path, + * rather than from echo > /sys/power/tuxonice/do_resume. + **/ +static void toi_sys_power_disk_try_resume(void) +{ + resume_attempted = 1; + + /* + * There's a comment in kernel/power/disk.c that indicates + * we should be able to use mutex_lock_nested below. That + * doesn't seem to cut it, though, so let's just turn lockdep + * off for now. 
+ */ + lockdep_off(); + + if (toi_start_anything(SYSFS_RESUMING)) + goto out; + + toi_try_resume(); + + /* + * For initramfs, we have to clear the boot time + * flag after trying to resume + */ + clear_toi_state(TOI_BOOT_TIME); + + toi_finish_anything(SYSFS_RESUMING); +out: + lockdep_on(); +} + +/** + * toi_try_hibernate - try to start a hibernation cycle + * + * Start a hibernation cycle, coming in from either + * echo > /sys/power/tuxonice/do_suspend + * + * or + * + * echo disk > /sys/power/state + * + * In the later case, we come in without pm_sem taken; in the + * former, it has been taken. + **/ +int toi_try_hibernate(void) +{ + int result = 0, sys_power_disk = 0, retries = 0; + + if (!mutex_is_locked(&tuxonice_in_use)) { + /* Came in via /sys/power/disk */ + if (toi_start_anything(SYSFS_HIBERNATING)) + return -EBUSY; + sys_power_disk = 1; + } + + current->flags |= PF_MEMALLOC; + + if (test_toi_state(TOI_CLUSTER_MODE)) { + toi_initiate_cluster_hibernate(); + goto out; + } + +prepare: + result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); + + if (result) + goto out; + + if (test_action_state(TOI_FREEZER_TEST)) + goto out_restore_gfp_mask; + + result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); + + if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) { + if (retries < 2) { + do_cleanup(0, 1); + retries++; + clear_result_state(TOI_ABORTED); + extra_pd1_pages_allowance = extra_pd1_pages_used + 500; + printk(KERN_INFO "Automatically adjusting the extra" + " pages allowance to %ld and restarting.\n", + extra_pd1_pages_allowance); + pm_restore_gfp_mask(); + goto prepare; + } + + printk(KERN_INFO "Adjusted extra pages allowance twice and " + "still couldn't hibernate successfully. Giving up."); + } + + /* This code runs at resume time too! */ + if (!result && toi_in_hibernate) + result = do_toi_step(STEP_HIBERNATE_POWERDOWN); + +out_restore_gfp_mask: + pm_restore_gfp_mask(); +out: + do_cleanup(1, 0); + current->flags &= ~PF_MEMALLOC; + + if (sys_power_disk) + toi_finish_anything(SYSFS_HIBERNATING); + + return result; +} + +/* + * channel_no: If !0, -c is added to args (userui). + */ +int toi_launch_userspace_program(char *command, int channel_no, + int wait, int debug) +{ + int retval; + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL }; + static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL + }; + char *channel = NULL; + int arg = 0, size; + char test_read[255]; + char *orig_posn = command; + + if (!strlen(orig_posn)) + return 1; + + if (channel_no) { + channel = toi_kzalloc(4, 6, GFP_KERNEL); + if (!channel) { + printk(KERN_INFO "Failed to allocate memory in " + "preparing to launch userspace program.\n"); + return 1; + } + } + + /* Up to 6 args supported */ + while (arg < 6) { + sscanf(orig_posn, "%s", test_read); + size = strlen(test_read); + if (!(size)) + break; + argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP); + strcpy(argv[arg], test_read); + orig_posn += size + 1; + *test_read = 0; + arg++; + } + + if (channel_no) { + sprintf(channel, "-c%d", channel_no); + argv[arg] = channel; + } else + arg--; + + if (debug) { + argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP); + strcpy(argv[arg], "--debug"); + } + + retval = call_usermodehelper(argv[0], argv, envp, wait); + + /* + * If the program reports an error, retval = 256. Don't complain + * about that here. 
+ */ + if (retval && retval != 256) + printk(KERN_ERR "Failed to launch userspace program '%s': " + "Error %d\n", command, retval); + + { + int i; + for (i = 0; i < arg; i++) + if (argv[i] && argv[i] != channel) + toi_kfree(5, argv[i], sizeof(*argv[i])); + } + + toi_kfree(4, channel, sizeof(*channel)); + + return retval; +} + +/* + * This array contains entries that are automatically registered at + * boot. Modules and the console code register their own entries separately. + */ +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_LONG("extra_pages_allowance", SYSFS_RW, + &extra_pd1_pages_allowance, 0, LONG_MAX, 0), + SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read, + image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL), + SYSFS_STRING("resume", SYSFS_RW, resume_file, 255, + SYSFS_NEEDS_SM_FOR_WRITE, + attempt_to_parse_resume_device2), + SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255, + SYSFS_NEEDS_SM_FOR_WRITE, + attempt_to_parse_alt_resume_param), + SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0, + NULL), + SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action, + TOI_IGNORE_ROOTFS, 0), + SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2, + INT_MAX, 0), + SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0), + SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action, + TOI_NO_MULTITHREADED_IO, 0), + SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action, + TOI_NO_FLUSHER_THREAD, 0), + SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action, + TOI_PAGESET2_FULL, 0), + SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0), + SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action, + TOI_REPLACE_SWSUSP, 0), + SYSFS_STRING("resume_commandline", SYSFS_RW, + toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0, + NULL), + SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL), + SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action, + TOI_FREEZER_TEST, 0), + SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0), + SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action, + TOI_TEST_FILTER_SPEED, 0), + SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action, + TOI_NO_PAGESET2, 0), + SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action, + TOI_NO_PS2_IF_UNNEEDED, 0), + SYSFS_STRING("binary_signature", SYSFS_READONLY, + tuxonice_signature, 9, 0, NULL), + SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0, + NULL), +#ifdef CONFIG_KGDB + SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action, + TOI_POST_RESUME_BREAKPOINT, 0), +#endif + SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action, + TOI_NO_READAHEAD, 0), + SYSFS_BIT("trace_debug_on", SYSFS_RW, &toi_bkd.toi_action, + TOI_TRACE_DEBUG_ON, 0), +#ifdef CONFIG_TOI_KEEP_IMAGE + SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE, + 0), +#endif +#ifdef CONFIG_TOI_INCREMENTAL + SYSFS_CUSTOM("pagestate", SYSFS_READONLY, get_toi_page_state, NULL, 0, + NULL), + SYSFS_BIT("incremental", SYSFS_RW, &toi_bkd.toi_action, + TOI_INCREMENTAL_IMAGE, 1), +#endif +}; + +static struct toi_core_fns my_fns = { + .get_nonconflicting_page = __toi_get_nonconflicting_page, + .post_context_save = __toi_post_context_save, + .try_hibernate = toi_try_hibernate, + .try_resume = toi_sys_power_disk_try_resume, +}; + +/** + * core_load - initialisation of TuxOnIce core + * + * Initialise the core, beginning with sysfs. 
Checksum and so on are part of + * the core, but have their own initialisation routines because they either + * aren't compiled in all the time or have their own subdirectories. + **/ +static __init int core_load(void) +{ + int i, + numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); + + printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION + " (http://tuxonice.net)\n"); + + if (!hibernation_available()) { + printk(KERN_INFO "TuxOnIce disabled due to request for hibernation" + " to be disabled in this kernel.\n"); + return 1; + } + + if (toi_sysfs_init()) + return 1; + + for (i = 0; i < numfiles; i++) + toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); + + toi_core_fns = &my_fns; + + if (toi_alloc_init()) + return 1; + if (toi_checksum_init()) + return 1; + if (toi_usm_init()) + return 1; + if (toi_ui_init()) + return 1; + if (toi_poweroff_init()) + return 1; + if (toi_cluster_init()) + return 1; + if (toi_cbw_init()) + return 1; + + return 0; +} + +late_initcall(core_load); diff -Nur linux-4.2/kernel/power/tuxonice_incremental.c linux-4.2-pck/kernel/power/tuxonice_incremental.c --- linux-4.2/kernel/power/tuxonice_incremental.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_incremental.c 2015-09-08 11:20:32.467011976 -0300 @@ -0,0 +1,402 @@ +/* + * kernel/power/tuxonice_incremental.c + * + * Copyright (C) 2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * This file contains routines related to storing incremental images - that + * is, retaining an image after an initial cycle and then storing incremental + * changes on subsequent hibernations. + * + * Based in part on on... + * + * Debug helper to dump the current kernel pagetables of the system + * so that we can see what the various memory ranges are set to. + * + * (C) Copyright 2008 Intel Corporation + * + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "tuxonice_pageflags.h" +#include "tuxonice_builtin.h" +#include "power.h" + +int toi_do_incremental_initcall; + +extern void kdb_init(int level); +extern noinline void kgdb_breakpoint(void); + +#undef pr_debug +#if 0 +#define pr_debug(a, b...) do { printk(a, ##b); } while(0) +#else +#define pr_debug(a, b...) do { } while(0) +#endif + +/* Multipliers for offsets within the PTEs */ +#define PTE_LEVEL_MULT (PAGE_SIZE) +#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) +#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) + +/* + * This function gets called on a break in a continuous series + * of PTE entries; the next one is different so we need to + * print what we collected so far. 
+ */ +static void note_page(void *addr) +{ + static struct page *lastpage; + struct page *page; + + page = virt_to_page(addr); + + if (page != lastpage) { + unsigned int level; + pte_t *pte = lookup_address((unsigned long) addr, &level); + struct page *pt_page2 = pte_page(*pte); + //debug("Note page %p (=> %p => %p|%ld).\n", addr, pte, pt_page2, page_to_pfn(pt_page2)); + SetPageTOI_Untracked(pt_page2); + lastpage = page; + } +} + +static void walk_pte_level(pmd_t addr) +{ + int i; + pte_t *start; + + start = (pte_t *) pmd_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PTE; i++) { + note_page(start); + start++; + } +} + +#if PTRS_PER_PMD > 1 + +static void walk_pmd_level(pud_t addr) +{ + int i; + pmd_t *start; + + start = (pmd_t *) pud_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(*start)) { + if (pmd_large(*start) || !pmd_present(*start)) + note_page(start); + else + walk_pte_level(*start); + } else + note_page(start); + start++; + } +} + +#else +#define walk_pmd_level(a) walk_pte_level(__pmd(pud_val(a))) +#define pud_large(a) pmd_large(__pmd(pud_val(a))) +#define pud_none(a) pmd_none(__pmd(pud_val(a))) +#endif + +#if PTRS_PER_PUD > 1 + +static void walk_pud_level(pgd_t addr) +{ + int i; + pud_t *start; + + start = (pud_t *) pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_PUD; i++) { + if (!pud_none(*start)) { + if (pud_large(*start) || !pud_present(*start)) + note_page(start); + else + walk_pmd_level(*start); + } else + note_page(start); + + start++; + } +} + +#else +#define walk_pud_level(a) walk_pmd_level(__pud(pgd_val(a))) +#define pgd_large(a) pud_large(__pud(pgd_val(a))) +#define pgd_none(a) pud_none(__pud(pgd_val(a))) +#endif + +/* + * Not static in the original at the time of writing, so needs renaming here. + */ +static void toi_ptdump_walk_pgd_level(pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *start = (pgd_t *) &init_level4_pgt; +#else + pgd_t *start = swapper_pg_dir; +#endif + int i; + if (pgd) { + start = pgd; + } + + for (i = 0; i < PTRS_PER_PGD; i++) { + if (!pgd_none(*start)) { + if (pgd_large(*start) || !pgd_present(*start)) + note_page(start); + else + walk_pud_level(*start); + } else + note_page(start); + + start++; + } + + /* Flush out the last page */ + note_page(start); +} + +#ifdef CONFIG_PARAVIRT +extern struct pv_info pv_info; + +static void toi_set_paravirt_ops_untracked(void) { + int i; + + unsigned long pvpfn = page_to_pfn(virt_to_page(__parainstructions)), + pvpfn_end = page_to_pfn(virt_to_page(__parainstructions_end)); + //debug(KERN_EMERG ".parainstructions goes from pfn %ld to %ld.\n", pvpfn, pvpfn_end); + for (i = pvpfn; i <= pvpfn_end; i++) { + SetPageTOI_Untracked(pfn_to_page(i)); + } +} +#else +#define toi_set_paravirt_ops_untracked() { do { } while(0) } +#endif + +extern void toi_mark_per_cpus_pages_untracked(void); + +void toi_untrack_stack(unsigned long *stack) +{ + int i; + struct page *stack_page = virt_to_page(stack); + + for (i = 0; i < (1 << THREAD_SIZE_ORDER); i++) { + pr_debug("Untrack stack page %p.\n", page_address(stack_page + i)); + SetPageTOI_Untracked(stack_page + i); + } +} +void toi_untrack_process(struct task_struct *p) +{ + SetPageTOI_Untracked(virt_to_page(p)); + pr_debug("Untrack process %d page %p.\n", p->pid, page_address(virt_to_page(p))); + + toi_untrack_stack(p->stack); +} + +void toi_generate_untracked_map(void) +{ + struct task_struct *p, *t; + struct page *page; + pte_t *pte; + int i; + unsigned int level; + static int been_here = 0; + + if (been_here) + return; + + been_here = 1; + + /* Pagetable 
pages */ + toi_ptdump_walk_pgd_level(NULL); + + /* Printk buffer - not normally needed but can be helpful for debugging. */ + //toi_set_logbuf_untracked(); + + /* Paravirt ops */ + toi_set_paravirt_ops_untracked(); + + /* Task structs and stacks */ + for_each_process_thread(p, t) { + toi_untrack_process(p); + //toi_untrack_stack((unsigned long *) t->thread.sp); + } + + for (i = 0; i < NR_CPUS; i++) { + struct task_struct *idle = idle_task(i); + + if (idle) { + pr_debug("Untrack idle process for CPU %d.\n", i); + toi_untrack_process(idle); + } + + /* IRQ stack */ + pr_debug("Untrack IRQ stack for CPU %d.\n", i); + toi_untrack_stack((unsigned long *)per_cpu(irq_stack_ptr, i)); + } + + /* Per CPU data */ + //pr_debug("Untracking per CPU variable pages.\n"); + toi_mark_per_cpus_pages_untracked(); + + /* Init stack - for bringing up secondary CPUs */ + page = virt_to_page(init_stack); + for (i = 0; i < DIV_ROUND_UP(sizeof(init_stack), PAGE_SIZE); i++) { + SetPageTOI_Untracked(page + i); + } + + pte = lookup_address((unsigned long) &mmu_cr4_features, &level); + SetPageTOI_Untracked(pte_page(*pte)); + SetPageTOI_Untracked(virt_to_page(trampoline_cr4_features)); +} + +/** + * toi_reset_dirtiness_one + */ + +void toi_reset_dirtiness_one(unsigned long pfn, int verbose) +{ + struct page *page = pfn_to_page(pfn); + + /** + * Don't worry about whether the Dirty flag is + * already set. If this is our first call, it + * won't be. + */ + + preempt_disable(); + + ClearPageTOI_Dirty(page); + SetPageTOI_RO(page); + if (verbose) + printk(KERN_EMERG "Making page %ld (%p|%p) read only.\n", pfn, page, page_address(page)); + + set_memory_ro((unsigned long) page_address(page), 1); + + preempt_enable(); +} + +/** + * TuxOnIce's incremental image support works by marking all memory apart from + * the page tables read-only, then in the page-faults that result enabling + * writing if appropriate and flagging the page as dirty. Free pages are also + * marked as dirty and not protected so that if allocated, they will be included + * in the image without further processing. + * + * toi_reset_dirtiness is called when and image exists and incremental images are + * enabled, and each time we resume thereafter. It is not invoked on a fresh boot. + * + * This routine should be called from a single-cpu-running context to avoid races in setting + * page dirty/read only flags. + * + * TODO: Make "it is not invoked on a fresh boot" true when I've finished developing it! + * + * TODO: Consider Xen paravirt guest boot issues. See arch/x86/mm/pageattr.c. + **/ + +int toi_reset_dirtiness(int verbose) +{ + struct zone *zone; + unsigned long loop; + int allocated_map = 0; + + toi_generate_untracked_map(); + + if (!free_map) { + if (!toi_alloc_bitmap(&free_map)) + return -ENOMEM; + allocated_map = 1; + } + + toi_generate_free_page_map(); + + pr_debug(KERN_EMERG "Reset dirtiness.\n"); + for_each_populated_zone(zone) { + // 64 bit only. No need to worry about highmem. + for (loop = 0; loop < zone->spanned_pages; loop++) { + unsigned long pfn = zone->zone_start_pfn + loop; + struct page *page; + int chunk_size; + + if (!pfn_valid(pfn)) { + continue; + } + + chunk_size = toi_size_of_free_region(zone, pfn); + if (chunk_size) { + loop += chunk_size - 1; + continue; + } + + page = pfn_to_page(pfn); + + if (PageNosave(page) || !saveable_page(zone, pfn)) { + continue; + } + + if (PageTOI_Untracked(page)) { + continue; + } + + /** + * Do we need to (re)protect the page? 
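+ * Roughly, the answer depends on the PageTOI_RO and PageTOI_Dirty flags: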
+ * If it is already protected (PageTOI_RO), there is + * nothing to do - skip the following. + * If it is marked as dirty (PageTOI_Dirty), it was + * either free and has been allocated or has been + * written to and marked dirty. Reset the dirty flag + * and (re)apply the protection. + */ + if (!PageTOI_RO(page)) { + toi_reset_dirtiness_one(pfn, verbose); + } + } + } + + pr_debug(KERN_EMERG "Done resetting dirtiness.\n"); + + if (allocated_map) { + toi_free_bitmap(&free_map); + } + return 0; +} + +static int toi_reset_dirtiness_initcall(void) +{ + if (toi_do_incremental_initcall) { + pr_info("TuxOnIce: Enabling dirty page tracking.\n"); + toi_reset_dirtiness(0); + } + return 1; +} +extern void toi_generate_untracked_map(void); + +// Leave early_initcall for pages to register untracked sections. +early_initcall(toi_reset_dirtiness_initcall); + +static int __init toi_incremental_initcall_setup(char *str) +{ + int value; + + if (sscanf(str, "=%d", &value) && value) + toi_do_incremental_initcall = value; + + return 1; +} +__setup("toi_incremental_initcall", toi_incremental_initcall_setup); diff -Nur linux-4.2/kernel/power/tuxonice_io.c linux-4.2-pck/kernel/power/tuxonice_io.c --- linux-4.2/kernel/power/tuxonice_io.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_io.c 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,1932 @@ +/* + * kernel/power/tuxonice_io.c + * + * Copyright (C) 1998-2001 Gabor Kuti + * Copyright (C) 1998,2001,2002 Pavel Machek + * Copyright (C) 2002-2003 Florent Chabaud + * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * It contains high level IO routines for hibernating. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tuxonice.h" +#include "tuxonice_modules.h" +#include "tuxonice_pageflags.h" +#include "tuxonice_io.h" +#include "tuxonice_ui.h" +#include "tuxonice_storage.h" +#include "tuxonice_prepare_image.h" +#include "tuxonice_extent.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_builtin.h" +#include "tuxonice_checksum.h" +#include "tuxonice_alloc.h" +char alt_resume_param[256]; + +/* Version read from image header at resume */ +static int toi_image_header_version; + +#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \ + if (likely(toi_image_header_version >= VERS)) \ + if (toiActiveAllocator->rw_header_chunk(READ, NULL, \ + (char *) &VAR, sizeof(VAR))) { \ + abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \ + ERR_ACT; \ + } \ +} while(0) \ + +/* Variables shared between threads and updated under the mutex */ +static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result; +static int io_index, io_nextupdate, io_pc, io_pc_step; +static DEFINE_MUTEX(io_mutex); +static DEFINE_PER_CPU(struct page *, last_sought); +static DEFINE_PER_CPU(struct page *, last_high_page); +static DEFINE_PER_CPU(char *, checksum_locn); +static DEFINE_PER_CPU(struct pbe *, last_low_page); +static atomic_t io_count; +atomic_t toi_io_workers; + +static int using_flusher; + +DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher); + +int toi_bio_queue_flusher_should_finish; + +int toi_max_workers; + +static char *image_version_error = "The image header version is newer than " \ + "this kernel supports."; + +struct toi_module_ops *first_filter; + +static atomic_t toi_num_other_threads; +static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue); +enum toi_worker_commands { + 
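+ /*
+  * How worker_rw_loop() below appears to use these commands: RUN lets a
+  * worker process pages, STOP parks idle workers on toi_worker_wait_queue
+  * until the next command arrives, and EXIT makes them free their buffers
+  * and return.
+  */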
TOI_IO_WORKER_STOP, + TOI_IO_WORKER_RUN, + TOI_IO_WORKER_EXIT +}; +static enum toi_worker_commands toi_worker_command; + +/** + * toi_attempt_to_parse_resume_device - determine if we can hibernate + * + * Can we hibernate, using the current resume= parameter? + **/ +int toi_attempt_to_parse_resume_device(int quiet) +{ + struct list_head *Allocator; + struct toi_module_ops *thisAllocator; + int result, returning = 0; + + if (toi_activate_storage(0)) + return 0; + + toiActiveAllocator = NULL; + clear_toi_state(TOI_RESUME_DEVICE_OK); + clear_toi_state(TOI_CAN_RESUME); + clear_result_state(TOI_ABORTED); + + if (!toiNumAllocators) { + if (!quiet) + printk(KERN_INFO "TuxOnIce: No storage allocators have " + "been registered. Hibernating will be " + "disabled.\n"); + goto cleanup; + } + + list_for_each(Allocator, &toiAllocators) { + thisAllocator = list_entry(Allocator, struct toi_module_ops, + type_list); + + /* + * Not sure why you'd want to disable an allocator, but + * we should honour the flag if we're providing it + */ + if (!thisAllocator->enabled) + continue; + + result = thisAllocator->parse_sig_location( + resume_file, (toiNumAllocators == 1), + quiet); + + switch (result) { + case -EINVAL: + /* For this allocator, but not a valid + * configuration. Error already printed. */ + goto cleanup; + + case 0: + /* For this allocator and valid. */ + toiActiveAllocator = thisAllocator; + + set_toi_state(TOI_RESUME_DEVICE_OK); + set_toi_state(TOI_CAN_RESUME); + returning = 1; + goto cleanup; + } + } + if (!quiet) + printk(KERN_INFO "TuxOnIce: No matching enabled allocator " + "found. Resuming disabled.\n"); +cleanup: + toi_deactivate_storage(0); + return returning; +} + +void attempt_to_parse_resume_device2(void) +{ + toi_prepare_usm(); + toi_attempt_to_parse_resume_device(0); + toi_cleanup_usm(); +} + +void save_restore_alt_param(int replace, int quiet) +{ + static char resume_param_save[255]; + static unsigned long toi_state_save; + + if (replace) { + toi_state_save = toi_state; + strcpy(resume_param_save, resume_file); + strcpy(resume_file, alt_resume_param); + } else { + strcpy(resume_file, resume_param_save); + toi_state = toi_state_save; + } + toi_attempt_to_parse_resume_device(quiet); +} + +void attempt_to_parse_alt_resume_param(void) +{ + int ok = 0; + + /* Temporarily set resume_param to the poweroff value */ + if (!strlen(alt_resume_param)) + return; + + printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n"); + save_restore_alt_param(SAVE, NOQUIET); + if (test_toi_state(TOI_CAN_RESUME)) + ok = 1; + + printk(KERN_INFO "=== Done ===\n"); + save_restore_alt_param(RESTORE, QUIET); + + /* If not ok, clear the string */ + if (ok) + return; + + printk(KERN_INFO "Can't resume from that location; clearing " + "alt_resume_param.\n"); + alt_resume_param[0] = '\0'; +} + +/** + * noresume_reset_modules - reset data structures in case of non resuming + * + * When we read the start of an image, modules (and especially the + * active allocator) might need to reset data structures if we + * decide to remove the image rather than resuming from it. + **/ +static void noresume_reset_modules(void) +{ + struct toi_module_ops *this_filter; + + list_for_each_entry(this_filter, &toi_filters, type_list) + if (this_filter->noresume_reset) + this_filter->noresume_reset(); + + if (toiActiveAllocator && toiActiveAllocator->noresume_reset) + toiActiveAllocator->noresume_reset(); +} + +/** + * fill_toi_header - fill the hibernate header structure + * @struct toi_header: Header data structure to be filled. 
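+ *
+ * Besides the swsusp_info fields filled in by init_header(), this records
+ * the pagedir, the size of pageset 2, the param0-param3 state words, the
+ * root filesystem's dev_t, the recent I/O timings and the address of the
+ * boot kernel data buffer, as the assignments below show.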
+ **/ +static int fill_toi_header(struct toi_header *sh) +{ + int i, error; + + error = init_header((struct swsusp_info *) sh); + if (error) + return error; + + sh->pagedir = pagedir1; + sh->pageset_2_size = pagedir2.size; + sh->param0 = toi_result; + sh->param1 = toi_bkd.toi_action; + sh->param2 = toi_bkd.toi_debug_state; + sh->param3 = toi_bkd.toi_default_console_level; + sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev; + for (i = 0; i < 4; i++) + sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2]; + sh->bkd = boot_kernel_data_buffer; + return 0; +} + +/** + * rw_init_modules - initialize modules + * @rw: Whether we are reading of writing an image. + * @which: Section of the image being processed. + * + * Iterate over modules, preparing the ones that will be used to read or write + * data. + **/ +static int rw_init_modules(int rw, int which) +{ + struct toi_module_ops *this_module; + /* Initialise page transformers */ + list_for_each_entry(this_module, &toi_filters, type_list) { + if (!this_module->enabled) + continue; + if (this_module->rw_init && this_module->rw_init(rw, which)) { + abort_hibernate(TOI_FAILED_MODULE_INIT, + "Failed to initialize the %s filter.", + this_module->name); + return 1; + } + } + + /* Initialise allocator */ + if (toiActiveAllocator->rw_init(rw, which)) { + abort_hibernate(TOI_FAILED_MODULE_INIT, + "Failed to initialise the allocator."); + return 1; + } + + /* Initialise other modules */ + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!this_module->enabled || + this_module->type == FILTER_MODULE || + this_module->type == WRITER_MODULE) + continue; + if (this_module->rw_init && this_module->rw_init(rw, which)) { + set_abort_result(TOI_FAILED_MODULE_INIT); + printk(KERN_INFO "Setting aborted flag due to module " + "init failure.\n"); + return 1; + } + } + + return 0; +} + +/** + * rw_cleanup_modules - cleanup modules + * @rw: Whether we are reading of writing an image. + * + * Cleanup components after reading or writing a set of pages. + * Only the allocator may fail. 
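+ *
+ * Every module's rw_cleanup() return value is OR-ed into a single result,
+ * so a failure (in practice from the allocator) is still reported even
+ * though the remaining modules get their cleanup run.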
+ **/ +static int rw_cleanup_modules(int rw) +{ + struct toi_module_ops *this_module; + int result = 0; + + /* Cleanup other modules */ + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!this_module->enabled || + this_module->type == FILTER_MODULE || + this_module->type == WRITER_MODULE) + continue; + if (this_module->rw_cleanup) + result |= this_module->rw_cleanup(rw); + } + + /* Flush data and cleanup */ + list_for_each_entry(this_module, &toi_filters, type_list) { + if (!this_module->enabled) + continue; + if (this_module->rw_cleanup) + result |= this_module->rw_cleanup(rw); + } + + result |= toiActiveAllocator->rw_cleanup(rw); + + return result; +} + +static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high) +{ + int index, min, max; + struct page *high_page = NULL, + **my_last_high_page = raw_cpu_ptr(&last_high_page), + **my_last_sought = raw_cpu_ptr(&last_sought); + struct pbe *this, **my_last_low_page = raw_cpu_ptr(&last_low_page); + void *compare; + + if (is_high) { + if (*my_last_sought && *my_last_high_page && + *my_last_sought < orig_page) + high_page = *my_last_high_page; + else + high_page = (struct page *) restore_highmem_pblist; + this = (struct pbe *) kmap(high_page); + compare = orig_page; + } else { + if (*my_last_sought && *my_last_low_page && + *my_last_sought < orig_page) + this = *my_last_low_page; + else + this = restore_pblist; + compare = page_address(orig_page); + } + + *my_last_sought = orig_page; + + /* Locate page containing pbe */ + while (this[PBES_PER_PAGE - 1].next && + this[PBES_PER_PAGE - 1].orig_address < compare) { + if (is_high) { + struct page *next_high_page = (struct page *) + this[PBES_PER_PAGE - 1].next; + kunmap(high_page); + this = kmap(next_high_page); + high_page = next_high_page; + } else + this = this[PBES_PER_PAGE - 1].next; + } + + /* Do a binary search within the page */ + min = 0; + max = PBES_PER_PAGE; + index = PBES_PER_PAGE / 2; + while (max - min) { + if (!this[index].orig_address || + this[index].orig_address > compare) + max = index; + else if (this[index].orig_address == compare) { + if (is_high) { + struct page *page = this[index].address; + *my_last_high_page = high_page; + kunmap(high_page); + return page; + } + *my_last_low_page = this; + return virt_to_page(this[index].address); + } else + min = index; + index = ((max + min) / 2); + }; + + if (is_high) + kunmap(high_page); + + abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for" + " orig page %p. This[min].orig_address=%p.\n", orig_page, + this[index].orig_address); + return NULL; +} + +/** + * write_next_page - write the next page in a pageset + * @data_pfn: The pfn where the next data to write is located. + * @my_io_index: The index of the page in the pageset. + * @write_pfn: The pfn number to write in the image (where the data belongs). + * + * Get the pfn of the next page to write, map the page if necessary and do the + * write. + **/ +static int write_next_page(unsigned long *data_pfn, int *my_io_index, + unsigned long *write_pfn) +{ + struct page *page; + char **my_checksum_locn = raw_cpu_ptr(&checksum_locn); + int result = 0, was_present; + + *data_pfn = memory_bm_next_pfn(io_map, 0); + + /* Another thread could have beaten us to it. 
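+  * If so, memory_bm_next_pfn() returns BM_END_OF_MAP here and io_count
+  * should already have dropped to zero.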
*/ + if (*data_pfn == BM_END_OF_MAP) { + if (atomic_read(&io_count)) { + printk(KERN_INFO "Ran out of pfns but io_count is " + "still %d.\n", atomic_read(&io_count)); + BUG(); + } + mutex_unlock(&io_mutex); + return -ENODATA; + } + + *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); + + memory_bm_clear_bit(io_map, 0, *data_pfn); + page = pfn_to_page(*data_pfn); + + was_present = kernel_page_present(page); + if (!was_present) + kernel_map_pages(page, 1, 1); + + if (io_pageset == 1) + *write_pfn = memory_bm_next_pfn(pageset1_map, 0); + else { + *write_pfn = *data_pfn; + *my_checksum_locn = tuxonice_get_next_checksum(); + } + + TOI_TRACE_DEBUG(*data_pfn, "_PS%d_write %d", io_pageset, *my_io_index); + + mutex_unlock(&io_mutex); + + if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn)) + return 1; + + result = first_filter->write_page(*write_pfn, TOI_PAGE, page, + PAGE_SIZE); + + if (!was_present) + kernel_map_pages(page, 1, 0); + + return result; +} + +/** + * read_next_page - read the next page in a pageset + * @my_io_index: The index of the page in the pageset. + * @write_pfn: The pfn in which the data belongs. + * + * Read a page of the image into our buffer. It can happen (here and in the + * write routine) that threads don't get run until after other CPUs have done + * all the work. This was the cause of the long standing issue with + * occasionally getting -ENODATA errors at the end of reading the image. We + * therefore need to check there's actually a page to read before trying to + * retrieve one. + **/ + +static int read_next_page(int *my_io_index, unsigned long *write_pfn, + struct page *buffer) +{ + unsigned int buf_size = PAGE_SIZE; + unsigned long left = atomic_read(&io_count); + + if (!left) + return -ENODATA; + + /* Start off assuming the page we read isn't resaved */ + *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); + + mutex_unlock(&io_mutex); + + /* + * Are we aborting? If so, don't submit any more I/O as + * resetting the resume_attempted flag (from ui.c) will + * clear the bdev flags, making this thread oops. + */ + if (unlikely(test_toi_state(TOI_STOP_RESUME))) { + atomic_dec(&toi_io_workers); + if (!atomic_read(&toi_io_workers)) { + /* + * So we can be sure we'll have memory for + * marking that we haven't resumed. + */ + rw_cleanup_modules(READ); + set_toi_state(TOI_IO_STOPPED); + } + while (1) + schedule(); + } + + /* + * See toi_bio_read_page in tuxonice_bio.c: + * read the next page in the image. + */ + return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size); +} + +static void use_read_page(unsigned long write_pfn, struct page *buffer) +{ + struct page *final_page = pfn_to_page(write_pfn), + *copy_page = final_page; + char *virt, *buffer_virt; + int was_present, cpu = smp_processor_id(); + unsigned long idx = 0; + + if (io_pageset == 1 && (!pageset1_copy_map || + !memory_bm_test_bit(pageset1_copy_map, cpu, write_pfn))) { + int is_high = PageHighMem(final_page); + copy_page = copy_page_from_orig_page(is_high ? 
(void *) write_pfn : final_page, is_high); + } + + if (!memory_bm_test_bit(io_map, cpu, write_pfn)) { + int test = !memory_bm_test_bit(io_map, cpu, write_pfn); + toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld (%d).", write_pfn, test); + mutex_lock(&io_mutex); + idx = atomic_add_return(1, &io_count); + mutex_unlock(&io_mutex); + return; + } + + virt = kmap(copy_page); + buffer_virt = kmap(buffer); + was_present = kernel_page_present(copy_page); + if (!was_present) + kernel_map_pages(copy_page, 1, 1); + memcpy(virt, buffer_virt, PAGE_SIZE); + if (!was_present) + kernel_map_pages(copy_page, 1, 0); + kunmap(copy_page); + kunmap(buffer); + memory_bm_clear_bit(io_map, cpu, write_pfn); + TOI_TRACE_DEBUG(write_pfn, "_PS%d_read", io_pageset); +} + +static unsigned long status_update(int writing, unsigned long done, + unsigned long ticks) +{ + int cs_index = writing ? 0 : 1; + unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks; + unsigned long msec = jiffies_to_msecs(abs(ticks_so_far)); + unsigned long pgs_per_s, estimate = 0, pages_left; + + if (msec) { + pages_left = io_barmax - done; + pgs_per_s = 1000 * done / msec; + if (pgs_per_s) + estimate = DIV_ROUND_UP(pages_left, pgs_per_s); + } + + if (estimate && ticks > HZ / 2) + return toi_update_status(done, io_barmax, + " %d/%d MB (%lu sec left)", + MB(done+1), MB(io_barmax), estimate); + + return toi_update_status(done, io_barmax, " %d/%d MB", + MB(done+1), MB(io_barmax)); +} + +/** + * worker_rw_loop - main loop to read/write pages + * + * The main I/O loop for reading or writing pages. The io_map bitmap is used to + * track the pages to read/write. + * If we are reading, the pages are loaded to their final (mapped) pfn. + * Data is non zero iff this is a thread started via start_other_threads. + * In that case, we stay in here until told to quit. + **/ +static int worker_rw_loop(void *data) +{ + unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4, + jif_index = 1, start_time = jiffies, thread_num; + int result = 0, my_io_index = 0, last_worker; + struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP); + cpumask_var_t orig_mask; + + if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) { + printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data); + result = -ENOMEM; + goto out; + } + + cpumask_copy(orig_mask, tsk_cpus_allowed(current)); + + current->flags |= PF_NOFREEZE; + +top: + mutex_lock(&io_mutex); + thread_num = atomic_read(&toi_io_workers); + + cpumask_copy(tsk_cpus_allowed(current), orig_mask); + schedule(); + + atomic_inc(&toi_io_workers); + + while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) && + !(io_write && test_result_state(TOI_ABORTED)) && + toi_worker_command == TOI_IO_WORKER_RUN) { + if (!thread_num && jiffies > next_jiffies) { + next_jiffies += HZ / 4; + if (toiActiveAllocator->update_throughput_throttle) + toiActiveAllocator->update_throughput_throttle( + jif_index); + jif_index++; + } + + /* + * What page to use? If reading, don't know yet which page's + * data will be read, so always use the buffer. If writing, + * use the copy (Pageset1) or original page (Pageset2), but + * always write the pfn of the original page. + */ + if (io_write) + result = write_next_page(&data_pfn, &my_io_index, + &write_pfn); + else /* Reading */ + result = read_next_page(&my_io_index, &write_pfn, + buffer); + + if (result) { + mutex_lock(&io_mutex); + /* Nothing to do? 
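+  * -ENODATA simply means another worker took the last of the pages, so
+  * this thread can stop quietly.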
*/ + if (result == -ENODATA) { + toi_message(TOI_IO, TOI_VERBOSE, 0, + "Thread %d has no more work.", + smp_processor_id()); + break; + } + + io_result = result; + + if (io_write) { + printk(KERN_INFO "Write chunk returned %d.\n", + result); + abort_hibernate(TOI_FAILED_IO, + "Failed to write a chunk of the " + "image."); + break; + } + + if (io_pageset == 1) { + printk(KERN_ERR "\nBreaking out of I/O loop " + "because of result code %d.\n", result); + break; + } + panic("Read chunk returned (%d)", result); + } + + /* + * Discard reads of resaved pages while reading ps2 + * and unwanted pages while rereading ps2 when aborting. + */ + if (!io_write) { + if (!PageResave(pfn_to_page(write_pfn))) + use_read_page(write_pfn, buffer); + else { + mutex_lock(&io_mutex); + toi_message(TOI_IO, TOI_VERBOSE, 0, + "Resaved %ld.", write_pfn); + atomic_inc(&io_count); + mutex_unlock(&io_mutex); + } + } + + if (!thread_num) { + if(my_io_index + io_base > io_nextupdate) + io_nextupdate = status_update(io_write, + my_io_index + io_base, + jiffies - start_time); + + if (my_io_index > io_pc) { + printk(KERN_CONT "...%d%%", 20 * io_pc_step); + io_pc_step++; + io_pc = io_finish_at * io_pc_step / 5; + } + } + + toi_cond_pause(0, NULL); + + /* + * Subtle: If there's less I/O still to be done than threads + * running, quit. This stops us doing I/O beyond the end of + * the image when reading. + * + * Possible race condition. Two threads could do the test at + * the same time; one should exit and one should continue. + * Therefore we take the mutex before comparing and exiting. + */ + + mutex_lock(&io_mutex); + } + + last_worker = atomic_dec_and_test(&toi_io_workers); + toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers)); + mutex_unlock(&io_mutex); + + if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) { + /* Were we the last thread and we're using a flusher thread? */ + if (last_worker && using_flusher) { + toiActiveAllocator->finish_all_io(); + } + /* First, if we're doing I/O, wait for it to finish */ + wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN); + /* Then wait to be told what to do next */ + wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP); + if (toi_worker_command == TOI_IO_WORKER_RUN) + goto top; + } + + if (thread_num) + atomic_dec(&toi_num_other_threads); + +out: + toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num); + toi__free_page(28, buffer); + free_cpumask_var(orig_mask); + + return result; +} + +int toi_start_other_threads(void) +{ + int cpu; + struct task_struct *p; + int to_start = (toi_max_workers ? 
toi_max_workers : num_online_cpus()) - 1; + unsigned long num_started = 0; + + if (test_action_state(TOI_NO_MULTITHREADED_IO)) + return 0; + + toi_worker_command = TOI_IO_WORKER_STOP; + + for_each_online_cpu(cpu) { + if (num_started == to_start) + break; + + if (cpu == smp_processor_id()) + continue; + + p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1, + cpu_to_node(cpu), "ktoi_io/%d", cpu); + if (IS_ERR(p)) { + printk(KERN_ERR "ktoi_io for %i failed\n", cpu); + continue; + } + kthread_bind(p, cpu); + p->flags |= PF_MEMALLOC; + wake_up_process(p); + num_started++; + atomic_inc(&toi_num_other_threads); + } + + toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started); + return num_started; +} + +void toi_stop_other_threads(void) +{ + toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads."); + toi_worker_command = TOI_IO_WORKER_EXIT; + wake_up(&toi_worker_wait_queue); +} + +/** + * do_rw_loop - main highlevel function for reading or writing pages + * + * Create the io_map bitmap and call worker_rw_loop to perform I/O operations. + **/ +static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags, + int base, int barmax, int pageset) +{ + int index = 0, cpu, result = 0, workers_started; + unsigned long pfn, next; + + first_filter = toi_get_next_filter(NULL); + + if (!finish_at) + return 0; + + io_write = write; + io_finish_at = finish_at; + io_base = base; + io_barmax = barmax; + io_pageset = pageset; + io_index = 0; + io_pc = io_finish_at / 5; + io_pc_step = 1; + io_result = 0; + io_nextupdate = base + 1; + toi_bio_queue_flusher_should_finish = 0; + + for_each_online_cpu(cpu) { + per_cpu(last_sought, cpu) = NULL; + per_cpu(last_low_page, cpu) = NULL; + per_cpu(last_high_page, cpu) = NULL; + } + + /* Ensure all bits clear */ + memory_bm_clear(io_map); + + memory_bm_position_reset(io_map); + next = memory_bm_next_pfn(io_map, 0); + + BUG_ON(next != BM_END_OF_MAP); + + /* Set the bits for the pages to write */ + memory_bm_position_reset(pageflags); + + pfn = memory_bm_next_pfn(pageflags, 0); + toi_trace_index++; + + while (pfn != BM_END_OF_MAP && index < finish_at) { + TOI_TRACE_DEBUG(pfn, "_io_pageset_%d (%d/%d)", pageset, index + 1, finish_at); + memory_bm_set_bit(io_map, 0, pfn); + pfn = memory_bm_next_pfn(pageflags, 0); + index++; + } + + BUG_ON(next != BM_END_OF_MAP || index < finish_at); + + memory_bm_position_reset(io_map); + toi_trace_index++; + + atomic_set(&io_count, finish_at); + + memory_bm_position_reset(pageset1_map); + + mutex_lock(&io_mutex); + + clear_toi_state(TOI_IO_STOPPED); + + using_flusher = (atomic_read(&toi_num_other_threads) && + toiActiveAllocator->io_flusher && + !test_action_state(TOI_NO_FLUSHER_THREAD)); + + workers_started = atomic_read(&toi_num_other_threads); + + memory_bm_position_reset(io_map); + memory_bm_position_reset(pageset1_copy_map); + + toi_worker_command = TOI_IO_WORKER_RUN; + wake_up(&toi_worker_wait_queue); + + mutex_unlock(&io_mutex); + + if (using_flusher) + result = toiActiveAllocator->io_flusher(write); + else + worker_rw_loop(NULL); + + while (atomic_read(&toi_io_workers)) + schedule(); + + printk(KERN_CONT "\n"); + + toi_worker_command = TOI_IO_WORKER_STOP; + wake_up(&toi_worker_wait_queue); + + if (unlikely(test_toi_state(TOI_STOP_RESUME))) { + if (!atomic_read(&toi_io_workers)) { + rw_cleanup_modules(READ); + set_toi_state(TOI_IO_STOPPED); + } + while (1) + schedule(); + } + set_toi_state(TOI_IO_STOPPED); + + if (!io_result && !result && !test_result_state(TOI_ABORTED)) { + unsigned long 
next; + + toi_update_status(io_base + io_finish_at, io_barmax, + " %d/%d MB ", + MB(io_base + io_finish_at), MB(io_barmax)); + + memory_bm_position_reset(io_map); + next = memory_bm_next_pfn(io_map, 0); + if (next != BM_END_OF_MAP) { + printk(KERN_INFO "Finished I/O loop but still work to " + "do?\nFinish at = %d. io_count = %d.\n", + finish_at, atomic_read(&io_count)); + printk(KERN_INFO "I/O bitmap still records work to do." + "%ld.\n", next); + BUG(); + do { + cpu_relax(); + } while (0); + } + } + + return io_result ? io_result : result; +} + +/** + * write_pageset - write a pageset to disk. + * @pagedir: Which pagedir to write. + * + * Returns: + * Zero on success or -1 on failure. + **/ +int write_pageset(struct pagedir *pagedir) +{ + int finish_at, base = 0; + int barmax = pagedir1.size + pagedir2.size; + long error = 0; + struct memory_bitmap *pageflags; + unsigned long start_time, end_time; + + /* + * Even if there is nothing to read or write, the allocator + * may need the init/cleanup for it's housekeeping. (eg: + * Pageset1 may start where pageset2 ends when writing). + */ + finish_at = pagedir->size; + + if (pagedir->id == 1) { + toi_prepare_status(DONT_CLEAR_BAR, + "Writing kernel & process data..."); + base = pagedir2.size; + if (test_action_state(TOI_TEST_FILTER_SPEED) || + test_action_state(TOI_TEST_BIO)) + pageflags = pageset1_map; + else + pageflags = pageset1_copy_map; + } else { + toi_prepare_status(DONT_CLEAR_BAR, "Writing caches..."); + pageflags = pageset2_map; + } + + start_time = jiffies; + + if (rw_init_modules(WRITE, pagedir->id)) { + abort_hibernate(TOI_FAILED_MODULE_INIT, + "Failed to initialise modules for writing."); + error = 1; + } + + if (!error) + error = do_rw_loop(WRITE, finish_at, pageflags, base, barmax, + pagedir->id); + + if (rw_cleanup_modules(WRITE) && !error) { + abort_hibernate(TOI_FAILED_MODULE_CLEANUP, + "Failed to cleanup after writing."); + error = 1; + } + + end_time = jiffies; + + if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { + toi_bkd.toi_io_time[0][0] += finish_at, + toi_bkd.toi_io_time[0][1] += (end_time - start_time); + } + + return error; +} + +/** + * read_pageset - highlevel function to read a pageset from disk + * @pagedir: pageset to read + * @overwrittenpagesonly: Whether to read the whole pageset or + * only part of it. + * + * Returns: + * Zero on success or -1 on failure. 
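+ *
+ * When @overwrittenpagesonly is set and pageset 2 is being read, only
+ * min(pagedir1.size, pagedir2.size) pages are read back - i.e. the part
+ * of pageset2 that loading pageset1 could have overwritten.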
+ **/ +static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly) +{ + int result = 0, base = 0; + int finish_at = pagedir->size; + int barmax = pagedir1.size + pagedir2.size; + struct memory_bitmap *pageflags; + unsigned long start_time, end_time; + + if (pagedir->id == 1) { + toi_prepare_status(DONT_CLEAR_BAR, + "Reading kernel & process data..."); + pageflags = pageset1_map; + } else { + toi_prepare_status(DONT_CLEAR_BAR, "Reading caches..."); + if (overwrittenpagesonly) { + barmax = min(pagedir1.size, pagedir2.size); + finish_at = min(pagedir1.size, pagedir2.size); + } else + base = pagedir1.size; + pageflags = pageset2_map; + } + + start_time = jiffies; + + if (rw_init_modules(READ, pagedir->id)) { + toiActiveAllocator->remove_image(); + result = 1; + } else + result = do_rw_loop(READ, finish_at, pageflags, base, barmax, + pagedir->id); + + if (rw_cleanup_modules(READ) && !result) { + abort_hibernate(TOI_FAILED_MODULE_CLEANUP, + "Failed to cleanup after reading."); + result = 1; + } + + /* Statistics */ + end_time = jiffies; + + if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { + toi_bkd.toi_io_time[1][0] += finish_at, + toi_bkd.toi_io_time[1][1] += (end_time - start_time); + } + + return result; +} + +/** + * write_module_configs - store the modules configuration + * + * The configuration for each module is stored in the image header. + * Returns: Int + * Zero on success, Error value otherwise. + **/ +static int write_module_configs(void) +{ + struct toi_module_ops *this_module; + char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP); + int len, index = 1; + struct toi_module_header toi_module_header; + + if (!buffer) { + printk(KERN_INFO "Failed to allocate a buffer for saving " + "module configuration info.\n"); + return -ENOMEM; + } + + /* + * We have to know which data goes with which module, so we at + * least write a length of zero for a module. Note that we are + * also assuming every module's config data takes <= PAGE_SIZE. + */ + + /* For each module (in registration order) */ + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!this_module->enabled || !this_module->storage_needed || + (this_module->type == WRITER_MODULE && + toiActiveAllocator != this_module)) + continue; + + /* Get the data from the module */ + len = 0; + if (this_module->save_config_info) + len = this_module->save_config_info(buffer); + + /* Save the details of the module */ + toi_module_header.enabled = this_module->enabled; + toi_module_header.type = this_module->type; + toi_module_header.index = index++; + strncpy(toi_module_header.name, this_module->name, + sizeof(toi_module_header.name)); + toiActiveAllocator->rw_header_chunk(WRITE, + this_module, + (char *) &toi_module_header, + sizeof(toi_module_header)); + + /* Save the size of the data and any data returned */ + toiActiveAllocator->rw_header_chunk(WRITE, + this_module, + (char *) &len, sizeof(int)); + if (len) + toiActiveAllocator->rw_header_chunk( + WRITE, this_module, buffer, len); + } + + /* Write a blank header to terminate the list */ + toi_module_header.name[0] = '\0'; + toiActiveAllocator->rw_header_chunk(WRITE, NULL, + (char *) &toi_module_header, sizeof(toi_module_header)); + + toi_free_page(22, (unsigned long) buffer); + return 0; +} + +/** + * read_one_module_config - read and configure one module + * + * Read the configuration for one module, and configure the module + * to match if it is loaded. + * + * Returns: Int + * Zero on success, Error value otherwise. 
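+ *
+ * If the module named in the header is not registered, its configuration
+ * data is still read (keeping the header stream in sync) and then simply
+ * discarded.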
+ **/ +static int read_one_module_config(struct toi_module_header *header) +{ + struct toi_module_ops *this_module; + int result, len; + char *buffer; + + /* Find the module */ + this_module = toi_find_module_given_name(header->name); + + if (!this_module) { + if (header->enabled) { + toi_early_boot_message(1, TOI_CONTINUE_REQ, + "It looks like we need module %s for reading " + "the image but it hasn't been registered.\n", + header->name); + if (!(test_toi_state(TOI_CONTINUE_REQ))) + return -EINVAL; + } else + printk(KERN_INFO "Module %s configuration data found, " + "but the module hasn't registered. Looks like " + "it was disabled, so we're ignoring its data.", + header->name); + } + + /* Get the length of the data (if any) */ + result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len, + sizeof(int)); + if (result) { + printk(KERN_ERR "Failed to read the length of the module %s's" + " configuration data.\n", + header->name); + return -EINVAL; + } + + /* Read any data and pass to the module (if we found one) */ + if (!len) + return 0; + + buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP); + + if (!buffer) { + printk(KERN_ERR "Failed to allocate a buffer for reloading " + "module configuration info.\n"); + return -ENOMEM; + } + + toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len); + + if (!this_module) + goto out; + + if (!this_module->save_config_info) + printk(KERN_ERR "Huh? Module %s appears to have a " + "save_config_info, but not a load_config_info " + "function!\n", this_module->name); + else + this_module->load_config_info(buffer, len); + + /* + * Now move this module to the tail of its lists. This will put it in + * order. Any new modules will end up at the top of the lists. They + * should have been set to disabled when loaded (people will + * normally not edit an initrd to load a new module and then hibernate + * without using it!). + */ + + toi_move_module_tail(this_module); + + this_module->enabled = header->enabled; + +out: + toi_free_page(23, (unsigned long) buffer); + return 0; +} + +/** + * read_module_configs - reload module configurations from the image header. + * + * Returns: Int + * Zero on success or an error code. + **/ +static int read_module_configs(void) +{ + int result = 0; + struct toi_module_header toi_module_header; + struct toi_module_ops *this_module; + + /* All modules are initially disabled. That way, if we have a module + * loaded now that wasn't loaded when we hibernated, it won't be used + * in trying to read the data. + */ + list_for_each_entry(this_module, &toi_modules, module_list) + this_module->enabled = 0; + + /* Get the first module header */ + result = toiActiveAllocator->rw_header_chunk(READ, NULL, + (char *) &toi_module_header, + sizeof(toi_module_header)); + if (result) { + printk(KERN_ERR "Failed to read the next module header.\n"); + return -EINVAL; + } + + /* For each module (in registration order) */ + while (toi_module_header.name[0]) { + result = read_one_module_config(&toi_module_header); + + if (result) + return -EINVAL; + + /* Get the next module header */ + result = toiActiveAllocator->rw_header_chunk(READ, NULL, + (char *) &toi_module_header, + sizeof(toi_module_header)); + + if (result) { + printk(KERN_ERR "Failed to read the next module " + "header.\n"); + return -EINVAL; + } + } + + return 0; +} + +static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev) +{ + return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 
0 : 1; +} + +int fs_info_space_needed(void) +{ + const struct super_block *sb; + int result = sizeof(int); + + list_for_each_entry(sb, &super_blocks, s_list) { + struct fs_info *fs; + + if (!sb->s_bdev) + continue; + + fs = fs_info_from_block_dev(sb->s_bdev); + if (save_fs_info(fs, sb->s_bdev)) + result += 16 + sizeof(dev_t) + sizeof(int) + + fs->last_mount_size; + free_fs_info(fs); + } + return result; +} + +static int fs_info_num_to_save(void) +{ + const struct super_block *sb; + int to_save = 0; + + list_for_each_entry(sb, &super_blocks, s_list) { + struct fs_info *fs; + + if (!sb->s_bdev) + continue; + + fs = fs_info_from_block_dev(sb->s_bdev); + if (save_fs_info(fs, sb->s_bdev)) + to_save++; + free_fs_info(fs); + } + + return to_save; +} + +static int fs_info_save(void) +{ + const struct super_block *sb; + int to_save = fs_info_num_to_save(); + + if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save, + sizeof(int))) { + abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info" + " to save."); + return -EIO; + } + + list_for_each_entry(sb, &super_blocks, s_list) { + struct fs_info *fs; + + if (!sb->s_bdev) + continue; + + fs = fs_info_from_block_dev(sb->s_bdev); + if (save_fs_info(fs, sb->s_bdev)) { + if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, + &fs->uuid[0], 16)) { + abort_hibernate(TOI_FAILED_IO, "Failed to " + "write uuid."); + return -EIO; + } + if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, + (char *) &fs->dev_t, sizeof(dev_t))) { + abort_hibernate(TOI_FAILED_IO, "Failed to " + "write dev_t."); + return -EIO; + } + if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, + (char *) &fs->last_mount_size, sizeof(int))) { + abort_hibernate(TOI_FAILED_IO, "Failed to " + "write last mount length."); + return -EIO; + } + if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, + fs->last_mount, fs->last_mount_size)) { + abort_hibernate(TOI_FAILED_IO, "Failed to " + "write uuid."); + return -EIO; + } + } + free_fs_info(fs); + } + return 0; +} + +static int fs_info_load_and_check_one(void) +{ + char uuid[16], *last_mount; + int result = 0, ln; + dev_t dev_t; + struct block_device *dev; + struct fs_info *fs_info, seek; + + if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) { + abort_hibernate(TOI_FAILED_IO, "Failed to read uuid."); + return -EIO; + } + + read_if_version(3, dev_t, "uuid dev_t field", return -EIO); + + if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln, + sizeof(int))) { + abort_hibernate(TOI_FAILED_IO, + "Failed to read last mount size."); + return -EIO; + } + + last_mount = kzalloc(ln, GFP_KERNEL); + + if (!last_mount) + return -ENOMEM; + + if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) { + abort_hibernate(TOI_FAILED_IO, + "Failed to read last mount timestamp."); + result = -EIO; + goto out_lmt; + } + + strncpy((char *) &seek.uuid, uuid, 16); + seek.dev_t = dev_t; + seek.last_mount_size = ln; + seek.last_mount = last_mount; + dev_t = blk_lookup_fs_info(&seek); + if (!dev_t) + goto out_lmt; + + dev = toi_open_by_devnum(dev_t); + + fs_info = fs_info_from_block_dev(dev); + if (fs_info && !IS_ERR(fs_info)) { + if (ln != fs_info->last_mount_size) { + printk(KERN_EMERG "Found matching uuid but last mount " + "time lengths differ?! 
" + "(%d vs %d).\n", ln, + fs_info->last_mount_size); + result = -EINVAL; + } else { + char buf[BDEVNAME_SIZE]; + result = !!memcmp(fs_info->last_mount, last_mount, ln); + if (result) + printk(KERN_EMERG "Last mount time for %s has " + "changed!\n", bdevname(dev, buf)); + } + } + toi_close_bdev(dev); + free_fs_info(fs_info); +out_lmt: + kfree(last_mount); + return result; +} + +static int fs_info_load_and_check(void) +{ + int to_do, result = 0; + + if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do, + sizeof(int))) { + abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info " + "to load."); + return -EIO; + } + + while(to_do--) + result |= fs_info_load_and_check_one(); + + return result; +} + +/** + * write_image_header - write the image header after write the image proper + * + * Returns: Int + * Zero on success, error value otherwise. + **/ +int write_image_header(void) +{ + int ret; + int total = pagedir1.size + pagedir2.size+2; + char *header_buffer = NULL; + + /* Now prepare to write the header */ + ret = toiActiveAllocator->write_header_init(); + if (ret) { + abort_hibernate(TOI_FAILED_MODULE_INIT, + "Active allocator's write_header_init" + " function failed."); + goto write_image_header_abort; + } + + /* Get a buffer */ + header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP); + if (!header_buffer) { + abort_hibernate(TOI_OUT_OF_MEMORY, + "Out of memory when trying to get page for header!"); + goto write_image_header_abort; + } + + /* Write hibernate header */ + if (fill_toi_header((struct toi_header *) header_buffer)) { + abort_hibernate(TOI_OUT_OF_MEMORY, + "Failure to fill header information!"); + goto write_image_header_abort; + } + + if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, + header_buffer, sizeof(struct toi_header))) { + abort_hibernate(TOI_OUT_OF_MEMORY, + "Failure to write header info."); + goto write_image_header_abort; + } + + if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, + (char *) &toi_max_workers, sizeof(toi_max_workers))) { + abort_hibernate(TOI_OUT_OF_MEMORY, + "Failure to number of workers to use."); + goto write_image_header_abort; + } + + /* Write filesystem info */ + if (fs_info_save()) + goto write_image_header_abort; + + /* Write module configurations */ + ret = write_module_configs(); + if (ret) { + abort_hibernate(TOI_FAILED_IO, + "Failed to write module configs."); + goto write_image_header_abort; + } + + if (memory_bm_write(pageset1_map, + toiActiveAllocator->rw_header_chunk)) { + abort_hibernate(TOI_FAILED_IO, + "Failed to write bitmaps."); + goto write_image_header_abort; + } + + /* Flush data and let allocator cleanup */ + if (toiActiveAllocator->write_header_cleanup()) { + abort_hibernate(TOI_FAILED_IO, + "Failed to cleanup writing header."); + goto write_image_header_abort_no_cleanup; + } + + if (test_result_state(TOI_ABORTED)) + goto write_image_header_abort_no_cleanup; + + toi_update_status(total, total, NULL); + +out: + if (header_buffer) + toi_free_page(24, (unsigned long) header_buffer); + return ret; + +write_image_header_abort: + toiActiveAllocator->write_header_cleanup(); +write_image_header_abort_no_cleanup: + ret = -1; + goto out; +} + +/** + * sanity_check - check the header + * @sh: the header which was saved at hibernate time. + * + * Perform a few checks, seeking to ensure that the kernel being + * booted matches the one hibernated. They need to match so we can + * be _sure_ things will work. It is not absolutely impossible for + * resuming from a different kernel to work, just not assured. 
+ **/ +static char *sanity_check(struct toi_header *sh) +{ + char *reason = check_image_kernel((struct swsusp_info *) sh); + + if (reason) + return reason; + + if (!test_action_state(TOI_IGNORE_ROOTFS)) { + const struct super_block *sb; + list_for_each_entry(sb, &super_blocks, s_list) { + if ((!(sb->s_flags & MS_RDONLY)) && + (sb->s_type->fs_flags & FS_REQUIRES_DEV)) + return "Device backed fs has been mounted " + "rw prior to resume or initrd/ramfs " + "is mounted rw."; + } + } + + return NULL; +} + +static DECLARE_WAIT_QUEUE_HEAD(freeze_wait); + +#define FREEZE_IN_PROGRESS (~0) + +static int freeze_result; + +static void do_freeze(struct work_struct *dummy) +{ + freeze_result = freeze_processes(); + wake_up(&freeze_wait); + trap_non_toi_io = 1; +} + +static DECLARE_WORK(freeze_work, do_freeze); + +/** + * __read_pageset1 - test for the existence of an image and attempt to load it + * + * Returns: Int + * Zero if image found and pageset1 successfully loaded. + * Error if no image found or loaded. + **/ +static int __read_pageset1(void) +{ + int i, result = 0; + char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP), + *sanity_error = NULL; + struct toi_header *toi_header; + + if (!header_buffer) { + printk(KERN_INFO "Unable to allocate a page for reading the " + "signature.\n"); + return -ENOMEM; + } + + /* Check for an image */ + result = toiActiveAllocator->image_exists(1); + if (result == 3) { + result = -ENODATA; + toi_early_boot_message(1, 0, "The signature from an older " + "version of TuxOnIce has been detected."); + goto out_remove_image; + } + + if (result != 1) { + result = -ENODATA; + noresume_reset_modules(); + printk(KERN_INFO "TuxOnIce: No image found.\n"); + goto out; + } + + /* + * Prepare the active allocator for reading the image header. The + * activate allocator might read its own configuration. + * + * NB: This call may never return because there might be a signature + * for a different image such that we warn the user and they choose + * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the + * location of the image might be unavailable if it was stored on a + * network connection). + */ + + result = toiActiveAllocator->read_header_init(); + if (result) { + printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the " + "image header.\n"); + goto out_remove_image; + } + + /* Check for noresume command line option */ + if (test_toi_state(TOI_NORESUME_SPECIFIED)) { + printk(KERN_INFO "TuxOnIce: Noresume on command line. 
Removed " + "image.\n"); + goto out_remove_image; + } + + /* Check whether we've resumed before */ + if (test_toi_state(TOI_RESUMED_BEFORE)) { + toi_early_boot_message(1, 0, NULL); + if (!(test_toi_state(TOI_CONTINUE_REQ))) { + printk(KERN_INFO "TuxOnIce: Tried to resume before: " + "Invalidated image.\n"); + goto out_remove_image; + } + } + + clear_toi_state(TOI_CONTINUE_REQ); + + toi_image_header_version = toiActiveAllocator->get_header_version(); + + if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) { + toi_early_boot_message(1, 0, image_version_error); + if (!(test_toi_state(TOI_CONTINUE_REQ))) { + printk(KERN_INFO "TuxOnIce: Header version too new: " + "Invalidated image.\n"); + goto out_remove_image; + } + } + + /* Read hibernate header */ + result = toiActiveAllocator->rw_header_chunk(READ, NULL, + header_buffer, sizeof(struct toi_header)); + if (result < 0) { + printk(KERN_ERR "TuxOnIce: Failed to read the image " + "signature.\n"); + goto out_remove_image; + } + + toi_header = (struct toi_header *) header_buffer; + + /* + * NB: This call may also result in a reboot rather than returning. + */ + + sanity_error = sanity_check(toi_header); + if (sanity_error) { + toi_early_boot_message(1, TOI_CONTINUE_REQ, + sanity_error); + printk(KERN_INFO "TuxOnIce: Sanity check failed.\n"); + goto out_remove_image; + } + + /* + * We have an image and it looks like it will load okay. + * + * Get metadata from header. Don't override commandline parameters. + * + * We don't need to save the image size limit because it's not used + * during resume and will be restored with the image anyway. + */ + + memcpy((char *) &pagedir1, + (char *) &toi_header->pagedir, sizeof(pagedir1)); + toi_result = toi_header->param0; + if (!toi_bkd.toi_debug_state) { + toi_bkd.toi_action = + (toi_header->param1 & ~toi_bootflags_mask) | + (toi_bkd.toi_action & toi_bootflags_mask); + toi_bkd.toi_debug_state = toi_header->param2; + toi_bkd.toi_default_console_level = toi_header->param3; + } + clear_toi_state(TOI_IGNORE_LOGLEVEL); + pagedir2.size = toi_header->pageset_2_size; + for (i = 0; i < 4; i++) + toi_bkd.toi_io_time[i/2][i%2] = + toi_header->io_time[i/2][i%2]; + + set_toi_state(TOI_BOOT_KERNEL); + boot_kernel_data_buffer = toi_header->bkd; + + read_if_version(1, toi_max_workers, "TuxOnIce max workers", + goto out_remove_image); + + /* Read filesystem info */ + if (fs_info_load_and_check()) { + printk(KERN_EMERG "TuxOnIce: File system mount time checks " + "failed. Refusing to corrupt your filesystems!\n"); + goto out_remove_image; + } + + /* Read module configurations */ + result = read_module_configs(); + if (result) { + pagedir1.size = 0; + pagedir2.size = 0; + printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module " + "configurations.\n"); + clear_action_state(TOI_KEEP_IMAGE); + goto out_remove_image; + } + + toi_prepare_console(); + + set_toi_state(TOI_NOW_RESUMING); + + result = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (result) + goto out_notifier_call_chain;; + + if (usermodehelper_disable()) + goto out_enable_usermodehelper; + + current->flags |= PF_NOFREEZE; + freeze_result = FREEZE_IN_PROGRESS; + + schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work); + + toi_cond_pause(1, "About to read original pageset1 locations."); + + /* + * See _toi_rw_header_chunk in tuxonice_bio.c: + * Initialize pageset1_map by reading the map from the image. 
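+  * The map loaded here is what the toi_get_pageset1_load_addresses() and
+  * read_pageset(&pagedir1, 0) steps further down work from.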
+ */ + if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk)) + goto out_thaw; + + /* + * See toi_rw_cleanup in tuxonice_bio.c: + * Clean up after reading the header. + */ + result = toiActiveAllocator->read_header_cleanup(); + if (result) { + printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the " + "image header.\n"); + goto out_thaw; + } + + toi_cond_pause(1, "About to read pagedir."); + + /* + * Get the addresses of pages into which we will load the kernel to + * be copied back and check if they conflict with the ones we are using. + */ + if (toi_get_pageset1_load_addresses()) { + printk(KERN_INFO "TuxOnIce: Failed to get load addresses for " + "pageset1.\n"); + goto out_thaw; + } + + /* Read the original kernel back */ + toi_cond_pause(1, "About to read pageset 1."); + + /* Given the pagemap, read back the data from disk */ + if (read_pageset(&pagedir1, 0)) { + toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1."); + result = -EIO; + goto out_thaw; + } + + toi_cond_pause(1, "About to restore original kernel."); + result = 0; + + if (!toi_keeping_image && + toiActiveAllocator->mark_resume_attempted) + toiActiveAllocator->mark_resume_attempted(1); + + wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); +out: + current->flags &= ~PF_NOFREEZE; + toi_free_page(25, (unsigned long) header_buffer); + return result; + +out_thaw: + wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); + trap_non_toi_io = 0; + thaw_processes(); +out_enable_usermodehelper: + usermodehelper_enable(); +out_notifier_call_chain: + pm_notifier_call_chain(PM_POST_RESTORE); + toi_cleanup_console(); +out_remove_image: + result = -EINVAL; + if (!toi_keeping_image) + toiActiveAllocator->remove_image(); + toiActiveAllocator->read_header_cleanup(); + noresume_reset_modules(); + goto out; +} + +/** + * read_pageset1 - highlevel function to read the saved pages + * + * Attempt to read the header and pageset1 of a hibernate image. + * Handle the outcome, complaining where appropriate. + **/ +int read_pageset1(void) +{ + int error; + + error = __read_pageset1(); + + if (error && error != -ENODATA && error != -EINVAL && + !test_result_state(TOI_ABORTED)) + abort_hibernate(TOI_IMAGE_ERROR, + "TuxOnIce: Error %d resuming\n", error); + + return error; +} + +/** + * get_have_image_data - check the image header + **/ +static char *get_have_image_data(void) +{ + char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP); + struct toi_header *toi_header; + + if (!output_buffer) { + printk(KERN_INFO "Output buffer null.\n"); + return NULL; + } + + /* Check for an image */ + if (!toiActiveAllocator->image_exists(1) || + toiActiveAllocator->read_header_init() || + toiActiveAllocator->rw_header_chunk(READ, NULL, + output_buffer, sizeof(struct toi_header))) { + sprintf(output_buffer, "0\n"); + /* + * From an initrd/ramfs, catting have_image and + * getting a result of 0 is sufficient. + */ + clear_toi_state(TOI_BOOT_TIME); + goto out; + } + + toi_header = (struct toi_header *) output_buffer; + + sprintf(output_buffer, "1\n%s\n%s\n", + toi_header->uts.machine, + toi_header->uts.version); + + /* Check whether we've resumed before */ + if (test_toi_state(TOI_RESUMED_BEFORE)) + strcat(output_buffer, "Resumed before.\n"); + +out: + noresume_reset_modules(); + return output_buffer; +} + +/** + * read_pageset2 - read second part of the image + * @overwrittenpagesonly: Read only pages which would have been + * verwritten by pageset1? 
+ * + * Read in part or all of pageset2 of an image, depending upon + * whether we are hibernating and have only overwritten a portion + * with pageset1 pages, or are resuming and need to read them + * all. + * + * Returns: Int + * Zero if no error, otherwise the error value. + **/ +int read_pageset2(int overwrittenpagesonly) +{ + int result = 0; + + if (!pagedir2.size) + return 0; + + result = read_pageset(&pagedir2, overwrittenpagesonly); + + toi_cond_pause(1, "Pagedir 2 read."); + + return result; +} + +/** + * image_exists_read - has an image been found? + * @page: Output buffer + * + * Store 0 or 1 in page, depending on whether an image is found. + * Incoming buffer is PAGE_SIZE and result is guaranteed + * to be far less than that, so we don't worry about + * overflow. + **/ +int image_exists_read(const char *page, int count) +{ + int len = 0; + char *result; + + if (toi_activate_storage(0)) + return count; + + if (!test_toi_state(TOI_RESUME_DEVICE_OK)) + toi_attempt_to_parse_resume_device(0); + + if (!toiActiveAllocator) { + len = sprintf((char *) page, "-1\n"); + } else { + result = get_have_image_data(); + if (result) { + len = sprintf((char *) page, "%s", result); + toi_free_page(26, (unsigned long) result); + } + } + + toi_deactivate_storage(0); + + return len; +} + +/** + * image_exists_write - invalidate an image if one exists + **/ +int image_exists_write(const char *buffer, int count) +{ + if (toi_activate_storage(0)) + return count; + + if (toiActiveAllocator && toiActiveAllocator->image_exists(1)) + toiActiveAllocator->remove_image(); + + toi_deactivate_storage(0); + + clear_result_state(TOI_KEPT_IMAGE); + + return count; +} diff -Nur linux-4.2/kernel/power/tuxonice_io.h linux-4.2-pck/kernel/power/tuxonice_io.h --- linux-4.2/kernel/power/tuxonice_io.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_io.h 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,72 @@ +/* + * kernel/power/tuxonice_io.h + * + * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * It contains high level IO routines for hibernating. 
+ * + */ + +#include +#include "tuxonice_pagedir.h" + +/* Non-module data saved in our image header */ +struct toi_header { + /* + * Mirror struct swsusp_info, but without + * the page aligned attribute + */ + struct new_utsname uts; + u32 version_code; + unsigned long num_physpages; + int cpus; + unsigned long image_pages; + unsigned long pages; + unsigned long size; + + /* Our own data */ + unsigned long orig_mem_free; + int page_size; + int pageset_2_size; + int param0; + int param1; + int param2; + int param3; + int progress0; + int progress1; + int progress2; + int progress3; + int io_time[2][2]; + struct pagedir pagedir; + dev_t root_fs; + unsigned long bkd; /* Boot kernel data locn */ +}; + +extern int write_pageset(struct pagedir *pagedir); +extern int write_image_header(void); +extern int read_pageset1(void); +extern int read_pageset2(int overwrittenpagesonly); + +extern int toi_attempt_to_parse_resume_device(int quiet); +extern void attempt_to_parse_resume_device2(void); +extern void attempt_to_parse_alt_resume_param(void); +int image_exists_read(const char *page, int count); +int image_exists_write(const char *buffer, int count); +extern void save_restore_alt_param(int replace, int quiet); +extern atomic_t toi_io_workers; + +/* Args to save_restore_alt_param */ +#define RESTORE 0 +#define SAVE 1 + +#define NOQUIET 0 +#define QUIET 1 + +extern wait_queue_head_t toi_io_queue_flusher; +extern int toi_bio_queue_flusher_should_finish; + +int fs_info_space_needed(void); + +extern int toi_max_workers; diff -Nur linux-4.2/kernel/power/tuxonice_modules.c linux-4.2-pck/kernel/power/tuxonice_modules.c --- linux-4.2/kernel/power/tuxonice_modules.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_modules.c 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,520 @@ +/* + * kernel/power/tuxonice_modules.c + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + */ + +#include +#include +#include "tuxonice.h" +#include "tuxonice_modules.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_ui.h" + +LIST_HEAD(toi_filters); +LIST_HEAD(toiAllocators); + +LIST_HEAD(toi_modules); + +struct toi_module_ops *toiActiveAllocator; + +static int toi_num_filters; +int toiNumAllocators, toi_num_modules; + +/* + * toi_header_storage_for_modules + * + * Returns the amount of space needed to store configuration + * data needed by the modules prior to copying back the original + * kernel. We can exclude data for pageset2 because it will be + * available anyway once the kernel is copied back. 
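+ *
+ * Per enabled module the estimate is storage_needed() plus a
+ * struct toi_module_header plus an int for the data length; one more
+ * empty toi_module_header is counted for the list terminator.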
+ */ +long toi_header_storage_for_modules(void) +{ + struct toi_module_ops *this_module; + int bytes = 0; + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!this_module->enabled || + (this_module->type == WRITER_MODULE && + toiActiveAllocator != this_module)) + continue; + if (this_module->storage_needed) { + int this = this_module->storage_needed() + + sizeof(struct toi_module_header) + + sizeof(int); + this_module->header_requested = this; + bytes += this; + } + } + + /* One more for the empty terminator */ + return bytes + sizeof(struct toi_module_header); +} + +void print_toi_header_storage_for_modules(void) +{ + struct toi_module_ops *this_module; + int bytes = 0; + + printk(KERN_DEBUG "Header storage:\n"); + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!this_module->enabled || + (this_module->type == WRITER_MODULE && + toiActiveAllocator != this_module)) + continue; + if (this_module->storage_needed) { + int this = this_module->storage_needed() + + sizeof(struct toi_module_header) + + sizeof(int); + this_module->header_requested = this; + bytes += this; + printk(KERN_DEBUG "+ %16s : %-4d/%d.\n", + this_module->name, + this_module->header_used, this); + } + } + + printk(KERN_DEBUG "+ empty terminator : %zu.\n", + sizeof(struct toi_module_header)); + printk(KERN_DEBUG " ====\n"); + printk(KERN_DEBUG " %zu\n", + bytes + sizeof(struct toi_module_header)); +} + +/* + * toi_memory_for_modules + * + * Returns the amount of memory requested by modules for + * doing their work during the cycle. + */ + +long toi_memory_for_modules(int print_parts) +{ + long bytes = 0, result; + struct toi_module_ops *this_module; + + if (print_parts) + printk(KERN_INFO "Memory for modules:\n===================\n"); + list_for_each_entry(this_module, &toi_modules, module_list) { + int this; + if (!this_module->enabled) + continue; + if (this_module->memory_needed) { + this = this_module->memory_needed(); + if (print_parts) + printk(KERN_INFO "%10d bytes (%5ld pages) for " + "module '%s'.\n", this, + DIV_ROUND_UP(this, PAGE_SIZE), + this_module->name); + bytes += this; + } + } + + result = DIV_ROUND_UP(bytes, PAGE_SIZE); + if (print_parts) + printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result); + + return result; +} + +/* + * toi_expected_compression_ratio + * + * Returns the compression ratio expected when saving the image. 
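/*
 * Worked sketch (illustrative values only): expected_compression() results are
 * composed as percentages, exactly as the loop below does. Two enabled filters
 * expecting 50% and 80% give an overall estimate of 40%, i.e. main storage is sized
 * at 40% of the raw page count.
 */
static int example_composed_ratio(void)
{
        int ratio = 100;

        ratio = ratio * 50 / 100;       /* first filter: expects 50% */
        ratio = ratio * 80 / 100;       /* second filter: expects 80% */
        return ratio;                   /* 40 */
}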
+ */ + +int toi_expected_compression_ratio(void) +{ + int ratio = 100; + struct toi_module_ops *this_module; + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!this_module->enabled) + continue; + if (this_module->expected_compression) + ratio = ratio * this_module->expected_compression() + / 100; + } + + return ratio; +} + +/* toi_find_module_given_dir + * Functionality : Return a module (if found), given a pointer + * to its directory name + */ + +static struct toi_module_ops *toi_find_module_given_dir(char *name) +{ + struct toi_module_ops *this_module, *found_module = NULL; + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!strcmp(name, this_module->directory)) { + found_module = this_module; + break; + } + } + + return found_module; +} + +/* toi_find_module_given_name + * Functionality : Return a module (if found), given a pointer + * to its name + */ + +struct toi_module_ops *toi_find_module_given_name(char *name) +{ + struct toi_module_ops *this_module, *found_module = NULL; + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!strcmp(name, this_module->name)) { + found_module = this_module; + break; + } + } + + return found_module; +} + +/* + * toi_print_module_debug_info + * Functionality : Get debugging info from modules into a buffer. + */ +int toi_print_module_debug_info(char *buffer, int buffer_size) +{ + struct toi_module_ops *this_module; + int len = 0; + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!this_module->enabled) + continue; + if (this_module->print_debug_info) { + int result; + result = this_module->print_debug_info(buffer + len, + buffer_size - len); + len += result; + } + } + + /* Ensure null terminated */ + buffer[buffer_size] = 0; + + return len; +} + +/* + * toi_register_module + * + * Register a module. + */ +int toi_register_module(struct toi_module_ops *module) +{ + int i; + struct kobject *kobj; + + if (!hibernation_available()) + return -ENODEV; + + module->enabled = 1; + + if (toi_find_module_given_name(module->name)) { + printk(KERN_INFO "TuxOnIce: Trying to load module %s," + " which is already registered.\n", + module->name); + return -EBUSY; + } + + switch (module->type) { + case FILTER_MODULE: + list_add_tail(&module->type_list, &toi_filters); + toi_num_filters++; + break; + case WRITER_MODULE: + list_add_tail(&module->type_list, &toiAllocators); + toiNumAllocators++; + break; + case MISC_MODULE: + case MISC_HIDDEN_MODULE: + case BIO_ALLOCATOR_MODULE: + break; + default: + printk(KERN_ERR "Hmmm. Module '%s' has an invalid type." + " It has been ignored.\n", module->name); + return -EINVAL; + } + list_add_tail(&module->module_list, &toi_modules); + toi_num_modules++; + + if ((!module->directory && !module->shared_directory) || + !module->sysfs_data || !module->num_sysfs_entries) + return 0; + + /* + * Modules may share a directory, but those with shared_dir + * set must be loaded (via symbol dependencies) after parents + * and unloaded beforehand. 
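/*
 * Minimal sketch (all names hypothetical, not from this patch): a child module that
 * attaches its sysfs entries to a directory owned by an already-registered parent.
 * The shared_directory string must match the parent's ->directory field; if that
 * parent has not been loaded first, registration fails with -ENODEV as handled below.
 */
static struct toi_module_ops example_child_ops = {
        .type             = MISC_MODULE,
        .name             = "example_child",
        .shared_directory = "example_parent",
        .module           = THIS_MODULE,
};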
+ */ + if (module->shared_directory) { + struct toi_module_ops *shared = + toi_find_module_given_dir(module->shared_directory); + if (!shared) { + printk(KERN_ERR "TuxOnIce: Module %s wants to share " + "%s's directory but %s isn't loaded.\n", + module->name, module->shared_directory, + module->shared_directory); + toi_unregister_module(module); + return -ENODEV; + } + kobj = shared->dir_kobj; + } else { + if (!strncmp(module->directory, "[ROOT]", 6)) + kobj = tuxonice_kobj; + else + kobj = make_toi_sysdir(module->directory); + } + module->dir_kobj = kobj; + for (i = 0; i < module->num_sysfs_entries; i++) { + int result = toi_register_sysfs_file(kobj, + &module->sysfs_data[i]); + if (result) + return result; + } + return 0; +} + +/* + * toi_unregister_module + * + * Remove a module. + */ +void toi_unregister_module(struct toi_module_ops *module) +{ + int i; + + if (module->dir_kobj) + for (i = 0; i < module->num_sysfs_entries; i++) + toi_unregister_sysfs_file(module->dir_kobj, + &module->sysfs_data[i]); + + if (!module->shared_directory && module->directory && + strncmp(module->directory, "[ROOT]", 6)) + remove_toi_sysdir(module->dir_kobj); + + switch (module->type) { + case FILTER_MODULE: + list_del(&module->type_list); + toi_num_filters--; + break; + case WRITER_MODULE: + list_del(&module->type_list); + toiNumAllocators--; + if (toiActiveAllocator == module) { + toiActiveAllocator = NULL; + clear_toi_state(TOI_CAN_RESUME); + clear_toi_state(TOI_CAN_HIBERNATE); + } + break; + case MISC_MODULE: + case MISC_HIDDEN_MODULE: + case BIO_ALLOCATOR_MODULE: + break; + default: + printk(KERN_ERR "Module '%s' has an invalid type." + " It has been ignored.\n", module->name); + return; + } + list_del(&module->module_list); + toi_num_modules--; +} + +/* + * toi_move_module_tail + * + * Rearrange modules when reloading the config. + */ +void toi_move_module_tail(struct toi_module_ops *module) +{ + switch (module->type) { + case FILTER_MODULE: + if (toi_num_filters > 1) + list_move_tail(&module->type_list, &toi_filters); + break; + case WRITER_MODULE: + if (toiNumAllocators > 1) + list_move_tail(&module->type_list, &toiAllocators); + break; + case MISC_MODULE: + case MISC_HIDDEN_MODULE: + case BIO_ALLOCATOR_MODULE: + break; + default: + printk(KERN_ERR "Module '%s' has an invalid type." + " It has been ignored.\n", module->name); + return; + } + if ((toi_num_filters + toiNumAllocators) > 1) + list_move_tail(&module->module_list, &toi_modules); +} + +/* + * toi_initialise_modules + * + * Get ready to do some work! + */ +int toi_initialise_modules(int starting_cycle, int early) +{ + struct toi_module_ops *this_module; + int result; + + list_for_each_entry(this_module, &toi_modules, module_list) { + this_module->header_requested = 0; + this_module->header_used = 0; + if (!this_module->enabled) + continue; + if (this_module->early != early) + continue; + if (this_module->initialise) { + result = this_module->initialise(starting_cycle); + if (result) { + toi_cleanup_modules(starting_cycle); + return result; + } + this_module->initialised = 1; + } + } + + return 0; +} + +/* + * toi_cleanup_modules + * + * Tell modules the work is done. 
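/*
 * Minimal sketch (hypothetical module, assumes <linux/gfp.h>): a module-side
 * initialise/cleanup pair. The core only calls cleanup() for modules whose
 * initialise() succeeded (initialised == 1), so cleanup can assume its own resources
 * exist.
 */
static unsigned long example_page;

static int example_initialise(int starting_cycle)
{
        example_page = get_zeroed_page(GFP_KERNEL);
        return example_page ? 0 : -ENOMEM;
}

static void example_cleanup(int finishing_cycle)
{
        if (example_page) {
                free_page(example_page);
                example_page = 0;
        }
}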
+ */ +void toi_cleanup_modules(int finishing_cycle) +{ + struct toi_module_ops *this_module; + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (!this_module->enabled || !this_module->initialised) + continue; + if (this_module->cleanup) + this_module->cleanup(finishing_cycle); + this_module->initialised = 0; + } +} + +/* + * toi_pre_atomic_restore_modules + * + * Get ready to do some work! + */ +void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd) +{ + struct toi_module_ops *this_module; + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (this_module->enabled && this_module->pre_atomic_restore) + this_module->pre_atomic_restore(bkd); + } +} + +/* + * toi_post_atomic_restore_modules + * + * Get ready to do some work! + */ +void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd) +{ + struct toi_module_ops *this_module; + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (this_module->enabled && this_module->post_atomic_restore) + this_module->post_atomic_restore(bkd); + } +} + +/* + * toi_get_next_filter + * + * Get the next filter in the pipeline. + */ +struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought) +{ + struct toi_module_ops *last_filter = NULL, *this_filter = NULL; + + list_for_each_entry(this_filter, &toi_filters, type_list) { + if (!this_filter->enabled) + continue; + if ((last_filter == filter_sought) || (!filter_sought)) + return this_filter; + last_filter = this_filter; + } + + return toiActiveAllocator; +} + +/** + * toi_show_modules: Printk what support is loaded. + */ +void toi_print_modules(void) +{ + struct toi_module_ops *this_module; + int prev = 0; + + printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for"); + + list_for_each_entry(this_module, &toi_modules, module_list) { + if (this_module->type == MISC_HIDDEN_MODULE) + continue; + printk("%s %s%s%s", prev ? "," : "", + this_module->enabled ? "" : "[", + this_module->name, + this_module->enabled ? "" : "]"); + prev = 1; + } + + printk(".\n"); +} + +/* toi_get_modules + * + * Take a reference to modules so they can't go away under us. + */ + +int toi_get_modules(void) +{ + struct toi_module_ops *this_module; + + list_for_each_entry(this_module, &toi_modules, module_list) { + struct toi_module_ops *this_module2; + + if (try_module_get(this_module->module)) + continue; + + /* Failed! Reverse gets and return error */ + list_for_each_entry(this_module2, &toi_modules, + module_list) { + if (this_module == this_module2) + return -EINVAL; + module_put(this_module2->module); + } + } + return 0; +} + +/* toi_put_modules + * + * Release our references to modules we used. + */ + +void toi_put_modules(void) +{ + struct toi_module_ops *this_module; + + list_for_each_entry(this_module, &toi_modules, module_list) + module_put(this_module->module); +} diff -Nur linux-4.2/kernel/power/tuxonice_modules.h linux-4.2-pck/kernel/power/tuxonice_modules.h --- linux-4.2/kernel/power/tuxonice_modules.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_modules.h 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,212 @@ +/* + * kernel/power/tuxonice_modules.h + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * It contains declarations for modules. Modules are additions to + * TuxOnIce that provide facilities such as image compression or + * encryption, backends for storage of the image and user interfaces. 
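/*
 * Minimal sketch (hypothetical filter, not from this patch): how a filter typically
 * uses toi_get_next_filter() defined above -- it caches the next enabled module in
 * the pipeline (or the active allocator at the end of the chain) at rw_init() time
 * and forwards each transformed page to it.
 */
static struct toi_module_ops example_filter_ops;
static struct toi_module_ops *example_next_driver;

static int example_rw_init(int rw, int stream_number)
{
        example_next_driver = toi_get_next_filter(&example_filter_ops);
        return example_next_driver ? 0 : -ENODEV;
}

static int example_write_page(unsigned long index, int buf_type, void *buf,
                              unsigned int buf_size)
{
        /* ... transform buf in place here ... */
        return example_next_driver->write_page(index, buf_type, buf, buf_size);
}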
+ * + */ + +#ifndef TOI_MODULES_H +#define TOI_MODULES_H + +/* This is the maximum size we store in the image header for a module name */ +#define TOI_MAX_MODULE_NAME_LENGTH 30 + +struct toi_boot_kernel_data; + +/* Per-module metadata */ +struct toi_module_header { + char name[TOI_MAX_MODULE_NAME_LENGTH]; + int enabled; + int type; + int index; + int data_length; + unsigned long signature; +}; + +enum { + FILTER_MODULE, + WRITER_MODULE, + BIO_ALLOCATOR_MODULE, + MISC_MODULE, + MISC_HIDDEN_MODULE, +}; + +enum { + TOI_ASYNC, + TOI_SYNC +}; + +enum { + TOI_VIRT, + TOI_PAGE, +}; + +#define TOI_MAP(type, addr) \ + (type == TOI_PAGE ? kmap(addr) : addr) + +#define TOI_UNMAP(type, addr) \ + do { \ + if (type == TOI_PAGE) \ + kunmap(addr); \ + } while(0) + +struct toi_module_ops { + /* Functions common to all modules */ + int type; + char *name; + char *directory; + char *shared_directory; + struct kobject *dir_kobj; + struct module *module; + int enabled, early, initialised; + struct list_head module_list; + + /* List of filters or allocators */ + struct list_head list, type_list; + + /* + * Requirements for memory and storage in + * the image header.. + */ + int (*memory_needed) (void); + int (*storage_needed) (void); + + int header_requested, header_used; + + int (*expected_compression) (void); + + /* + * Debug info + */ + int (*print_debug_info) (char *buffer, int size); + int (*save_config_info) (char *buffer); + void (*load_config_info) (char *buffer, int len); + + /* + * Initialise & cleanup - general routines called + * at the start and end of a cycle. + */ + int (*initialise) (int starting_cycle); + void (*cleanup) (int finishing_cycle); + + void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd); + void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd); + + /* + * Calls for allocating storage (allocators only). + * + * Header space is requested separately and cannot fail, but the + * reservation is only applied when main storage is allocated. + * The header space reservation is thus always set prior to + * requesting the allocation of storage - and prior to querying + * how much storage is available. + */ + + unsigned long (*storage_available) (void); + void (*reserve_header_space) (unsigned long space_requested); + int (*register_storage) (void); + int (*allocate_storage) (unsigned long space_requested); + unsigned long (*storage_allocated) (void); + void (*free_unused_storage) (void); + + /* + * Routines used in image I/O. 
+ */ + int (*rw_init) (int rw, int stream_number); + int (*rw_cleanup) (int rw); + int (*write_page) (unsigned long index, int buf_type, void *buf, + unsigned int buf_size); + int (*read_page) (unsigned long *index, int buf_type, void *buf, + unsigned int *buf_size); + int (*io_flusher) (int rw); + + /* Reset module if image exists but reading aborted */ + void (*noresume_reset) (void); + + /* Read and write the metadata */ + int (*write_header_init) (void); + int (*write_header_cleanup) (void); + + int (*read_header_init) (void); + int (*read_header_cleanup) (void); + + /* To be called after read_header_init */ + int (*get_header_version) (void); + + int (*rw_header_chunk) (int rw, struct toi_module_ops *owner, + char *buffer_start, int buffer_size); + + int (*rw_header_chunk_noreadahead) (int rw, + struct toi_module_ops *owner, char *buffer_start, + int buffer_size); + + /* Attempt to parse an image location */ + int (*parse_sig_location) (char *buffer, int only_writer, int quiet); + + /* Throttle I/O according to throughput */ + void (*update_throughput_throttle) (int jif_index); + + /* Flush outstanding I/O */ + int (*finish_all_io) (void); + + /* Determine whether image exists that we can restore */ + int (*image_exists) (int quiet); + + /* Mark the image as having tried to resume */ + int (*mark_resume_attempted) (int); + + /* Destroy image if one exists */ + int (*remove_image) (void); + + /* Sysfs Data */ + struct toi_sysfs_data *sysfs_data; + int num_sysfs_entries; + + /* Block I/O allocator */ + struct toi_bio_allocator_ops *bio_allocator_ops; +}; + +extern int toi_num_modules, toiNumAllocators; + +extern struct toi_module_ops *toiActiveAllocator; +extern struct list_head toi_filters, toiAllocators, toi_modules; + +extern void toi_prepare_console_modules(void); +extern void toi_cleanup_console_modules(void); + +extern struct toi_module_ops *toi_find_module_given_name(char *name); +extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *); + +extern int toi_register_module(struct toi_module_ops *module); +extern void toi_move_module_tail(struct toi_module_ops *module); + +extern long toi_header_storage_for_modules(void); +extern long toi_memory_for_modules(int print_parts); +extern void print_toi_header_storage_for_modules(void); +extern int toi_expected_compression_ratio(void); + +extern int toi_print_module_debug_info(char *buffer, int buffer_size); +extern int toi_register_module(struct toi_module_ops *module); +extern void toi_unregister_module(struct toi_module_ops *module); + +extern int toi_initialise_modules(int starting_cycle, int early); +#define toi_initialise_modules_early(starting) \ + toi_initialise_modules(starting, 1) +#define toi_initialise_modules_late(starting) \ + toi_initialise_modules(starting, 0) +extern void toi_cleanup_modules(int finishing_cycle); + +extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd); +extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd); + +extern void toi_print_modules(void); + +int toi_get_modules(void); +void toi_put_modules(void); +#endif diff -Nur linux-4.2/kernel/power/tuxonice_netlink.c linux-4.2-pck/kernel/power/tuxonice_netlink.c --- linux-4.2/kernel/power/tuxonice_netlink.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_netlink.c 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,324 @@ +/* + * kernel/power/tuxonice_netlink.c + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released 
under the GPLv2. + * + * Functions for communicating with a userspace helper via netlink. + */ + +#include +#include +#include +#include "tuxonice_netlink.h" +#include "tuxonice.h" +#include "tuxonice_modules.h" +#include "tuxonice_alloc.h" +#include "tuxonice_builtin.h" + +static struct user_helper_data *uhd_list; + +/* + * Refill our pool of SKBs for use in emergencies (eg, when eating memory and + * none can be allocated). + */ +static void toi_fill_skb_pool(struct user_helper_data *uhd) +{ + while (uhd->pool_level < uhd->pool_limit) { + struct sk_buff *new_skb = + alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); + + if (!new_skb) + break; + + new_skb->next = uhd->emerg_skbs; + uhd->emerg_skbs = new_skb; + uhd->pool_level++; + } +} + +/* + * Try to allocate a single skb. If we can't get one, try to use one from + * our pool. + */ +static struct sk_buff *toi_get_skb(struct user_helper_data *uhd) +{ + struct sk_buff *skb = + alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); + + if (skb) + return skb; + + skb = uhd->emerg_skbs; + if (skb) { + uhd->pool_level--; + uhd->emerg_skbs = skb->next; + skb->next = NULL; + } + + return skb; +} + +void toi_send_netlink_message(struct user_helper_data *uhd, + int type, void *params, size_t len) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + void *dest; + struct task_struct *t; + + if (uhd->pid == -1) + return; + + if (uhd->debug) + printk(KERN_ERR "toi_send_netlink_message: Send " + "message type %d.\n", type); + + skb = toi_get_skb(uhd); + if (!skb) { + printk(KERN_INFO "toi_netlink: Can't allocate skb!\n"); + return; + } + + nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0); + uhd->sock_seq++; + + dest = NLMSG_DATA(nlh); + if (params && len > 0) + memcpy(dest, params, len); + + netlink_unicast(uhd->nl, skb, uhd->pid, 0); + + toi_read_lock_tasklist(); + t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); + if (!t) { + toi_read_unlock_tasklist(); + if (uhd->pid > -1) + printk(KERN_INFO "Hmm. Can't find the userspace task" + " %d.\n", uhd->pid); + return; + } + wake_up_process(t); + toi_read_unlock_tasklist(); + + yield(); +} + +static void send_whether_debugging(struct user_helper_data *uhd) +{ + static u8 is_debugging = 1; + + toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING, + &is_debugging, sizeof(u8)); +} + +/* + * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we + * are hibernating. + */ +static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid) +{ + struct task_struct *t; + + if (uhd->debug) + printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid); + + toi_read_lock_tasklist(); + t = find_task_by_pid_ns(pid, &init_pid_ns); + if (!t) { + toi_read_unlock_tasklist(); + printk(KERN_INFO "Strange. Can't find the userspace task %d.\n", + pid); + return -EINVAL; + } + + t->flags |= PF_NOFREEZE; + + toi_read_unlock_tasklist(); + uhd->pid = pid; + + toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0); + + return 0; +} + +/* + * Called when the userspace process has informed us that it's ready to roll. + */ +static int nl_ready(struct user_helper_data *uhd, u32 version) +{ + if (version != uhd->interface_version) { + printk(KERN_INFO "%s userspace process using invalid interface" + " version (%d - kernel wants %d). 
Trying to " + "continue without it.\n", + uhd->name, version, uhd->interface_version); + if (uhd->not_ready) + uhd->not_ready(); + return -EINVAL; + } + + complete(&uhd->wait_for_process); + + return 0; +} + +void toi_netlink_close_complete(struct user_helper_data *uhd) +{ + if (uhd->nl) { + netlink_kernel_release(uhd->nl); + uhd->nl = NULL; + } + + while (uhd->emerg_skbs) { + struct sk_buff *next = uhd->emerg_skbs->next; + kfree_skb(uhd->emerg_skbs); + uhd->emerg_skbs = next; + } + + uhd->pid = -1; +} + +static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd, + struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int type = nlh->nlmsg_type; + int *data; + int err; + + if (uhd->debug) + printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n", + type); + + /* Let the more specific handler go first. It returns + * 1 for valid messages that it doesn't know. */ + err = uhd->rcv_msg(skb, nlh); + if (err != 1) + return err; + + /* Only allow one task to receive NOFREEZE privileges */ + if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) { + printk(KERN_INFO "Received extra nofreeze me requests.\n"); + return -EBUSY; + } + + data = NLMSG_DATA(nlh); + + switch (type) { + case NETLINK_MSG_NOFREEZE_ME: + return nl_set_nofreeze(uhd, nlh->nlmsg_pid); + case NETLINK_MSG_GET_DEBUGGING: + send_whether_debugging(uhd); + return 0; + case NETLINK_MSG_READY: + if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) { + printk(KERN_INFO "Invalid ready mesage.\n"); + if (uhd->not_ready) + uhd->not_ready(); + return -EINVAL; + } + return nl_ready(uhd, (u32) *data); + case NETLINK_MSG_CLEANUP: + toi_netlink_close_complete(uhd); + return 0; + } + + return -EINVAL; +} + +static void toi_user_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr *nlh; + struct user_helper_data *uhd = uhd_list; + + while (uhd && uhd->netlink_id != skb->sk->sk_protocol) + uhd = uhd->next; + + if (!uhd) + return; + + while (skb->len >= NLMSG_SPACE(0)) { + u32 rlen; + + nlh = (struct nlmsghdr *) skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + + err = toi_nl_gen_rcv_msg(uhd, skb, nlh); + if (err) + netlink_ack(skb, nlh, err); + else if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } +} + +static int netlink_prepare(struct user_helper_data *uhd) +{ + struct netlink_kernel_cfg cfg = { + .groups = 0, + .input = toi_user_rcv_skb, + }; + + uhd->next = uhd_list; + uhd_list = uhd; + + uhd->sock_seq = 0x42c0ffee; + uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg); + if (!uhd->nl) { + printk(KERN_INFO "Failed to allocate netlink socket for %s.\n", + uhd->name); + return -ENOMEM; + } + + toi_fill_skb_pool(uhd); + + return 0; +} + +void toi_netlink_close(struct user_helper_data *uhd) +{ + struct task_struct *t; + + toi_read_lock_tasklist(); + t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); + if (t) + t->flags &= ~PF_NOFREEZE; + toi_read_unlock_tasklist(); + + toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0); +} +int toi_netlink_setup(struct user_helper_data *uhd) +{ + /* In case userui didn't cleanup properly on us */ + toi_netlink_close_complete(uhd); + + if (netlink_prepare(uhd) < 0) { + printk(KERN_INFO "Netlink prepare failed.\n"); + return 1; + } + + if (toi_launch_userspace_program(uhd->program, uhd->netlink_id, + UMH_WAIT_EXEC, uhd->debug) < 0) { + printk(KERN_INFO "Launch userspace program failed.\n"); + toi_netlink_close_complete(uhd); + return 1; 
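/*
 * Minimal sketch (all values hypothetical, not from this patch): what a caller of
 * toi_netlink_setup() typically provides. rcv_msg() returns 1 to hand unrecognised
 * messages back to the generic handler above; the netlink id, helper path and
 * interface version are illustrative only.
 */
static int example_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        return 1;       /* not handled here */
}

static struct user_helper_data example_uhd = {
        .rcv_msg           = example_rcv_msg,
        .name              = "example helper",
        .program           = "/usr/sbin/example-helper",
        .pool_limit        = 6,
        .skb_size          = 4096,
        .netlink_id        = 30,
        .interface_version = 1,
};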
+ } + + /* Wait 2 seconds for the userspace process to make contact */ + wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ); + + if (uhd->pid == -1) { + printk(KERN_INFO "%s: Failed to contact userspace process.\n", + uhd->name); + toi_netlink_close_complete(uhd); + return 1; + } + + return 0; +} diff -Nur linux-4.2/kernel/power/tuxonice_netlink.h linux-4.2-pck/kernel/power/tuxonice_netlink.h --- linux-4.2/kernel/power/tuxonice_netlink.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_netlink.h 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,62 @@ +/* + * kernel/power/tuxonice_netlink.h + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * Declarations for functions for communicating with a userspace helper + * via netlink. + */ + +#include +#include + +#define NETLINK_MSG_BASE 0x10 + +#define NETLINK_MSG_READY 0x10 +#define NETLINK_MSG_NOFREEZE_ME 0x16 +#define NETLINK_MSG_GET_DEBUGGING 0x19 +#define NETLINK_MSG_CLEANUP 0x24 +#define NETLINK_MSG_NOFREEZE_ACK 0x27 +#define NETLINK_MSG_IS_DEBUGGING 0x28 + +struct user_helper_data { + int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh); + void (*not_ready) (void); + struct sock *nl; + u32 sock_seq; + pid_t pid; + char *comm; + char program[256]; + int pool_level; + int pool_limit; + struct sk_buff *emerg_skbs; + int skb_size; + int netlink_id; + char *name; + struct user_helper_data *next; + struct completion wait_for_process; + u32 interface_version; + int must_init; + int debug; +}; + +#ifdef CONFIG_NET +int toi_netlink_setup(struct user_helper_data *uhd); +void toi_netlink_close(struct user_helper_data *uhd); +void toi_send_netlink_message(struct user_helper_data *uhd, + int type, void *params, size_t len); +void toi_netlink_close_complete(struct user_helper_data *uhd); +#else +static inline int toi_netlink_setup(struct user_helper_data *uhd) +{ + return 0; +} + +static inline void toi_netlink_close(struct user_helper_data *uhd) { }; +static inline void toi_send_netlink_message(struct user_helper_data *uhd, + int type, void *params, size_t len) { }; +static inline void toi_netlink_close_complete(struct user_helper_data *uhd) + { }; +#endif diff -Nur linux-4.2/kernel/power/tuxonice_pagedir.c linux-4.2-pck/kernel/power/tuxonice_pagedir.c --- linux-4.2/kernel/power/tuxonice_pagedir.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_pagedir.c 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,345 @@ +/* + * kernel/power/tuxonice_pagedir.c + * + * Copyright (C) 1998-2001 Gabor Kuti + * Copyright (C) 1998,2001,2002 Pavel Machek + * Copyright (C) 2002-2003 Florent Chabaud + * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * Routines for handling pagesets. + * Note that pbes aren't actually stored as such. They're stored as + * bitmaps and extents. 
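/*
 * Minimal sketch (assumes <linux/suspend.h> for struct pbe and <linux/string.h>):
 * how the lowmem pbe chain built in this file is consumed at atomic-restore time.
 * Each entry records "copy this safely-loaded page back over its original location";
 * the highmem chain instead stores pfns and struct page pointers and needs kmap.
 */
static void example_walk_lowmem_pbes(struct pbe *list)
{
        struct pbe *p;

        for (p = list; p; p = p->next)
                memcpy(p->orig_address, p->address, PAGE_SIZE);
}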
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "tuxonice_pageflags.h" +#include "tuxonice_ui.h" +#include "tuxonice_pagedir.h" +#include "tuxonice_prepare_image.h" +#include "tuxonice.h" +#include "tuxonice_builtin.h" +#include "tuxonice_alloc.h" + +static int ptoi_pfn; +static struct pbe *this_low_pbe; +static struct pbe **last_low_pbe_ptr; + +void toi_reset_alt_image_pageset2_pfn(void) +{ + memory_bm_position_reset(pageset2_map); +} + +static struct page *first_conflicting_page; + +/* + * free_conflicting_pages + */ + +static void free_conflicting_pages(void) +{ + while (first_conflicting_page) { + struct page *next = + *((struct page **) kmap(first_conflicting_page)); + kunmap(first_conflicting_page); + toi__free_page(29, first_conflicting_page); + first_conflicting_page = next; + } +} + +/* __toi_get_nonconflicting_page + * + * Description: Gets order zero pages that won't be overwritten + * while copying the original pages. + */ + +struct page *___toi_get_nonconflicting_page(int can_be_highmem) +{ + struct page *page; + gfp_t flags = TOI_ATOMIC_GFP; + if (can_be_highmem) + flags |= __GFP_HIGHMEM; + + + if (test_toi_state(TOI_LOADING_ALT_IMAGE) && + pageset2_map && ptoi_pfn) { + do { + ptoi_pfn = memory_bm_next_pfn(pageset2_map, 0); + if (ptoi_pfn != BM_END_OF_MAP) { + page = pfn_to_page(ptoi_pfn); + if (!PagePageset1(page) && + (can_be_highmem || !PageHighMem(page))) + return page; + } + } while (ptoi_pfn); + } + + do { + page = toi_alloc_page(29, flags | __GFP_ZERO); + if (!page) { + printk(KERN_INFO "Failed to get nonconflicting " + "page.\n"); + return NULL; + } + if (PagePageset1(page)) { + struct page **next = (struct page **) kmap(page); + *next = first_conflicting_page; + first_conflicting_page = page; + kunmap(page); + } + } while (PagePageset1(page)); + + return page; +} + +unsigned long __toi_get_nonconflicting_page(void) +{ + struct page *page = ___toi_get_nonconflicting_page(0); + return page ? (unsigned long) page_address(page) : 0; +} + +static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe, + int highmem) +{ + if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1)) + + 2 * sizeof(struct pbe)) > PAGE_SIZE) { + struct page *new_page = + ___toi_get_nonconflicting_page(highmem); + if (!new_page) + return ERR_PTR(-ENOMEM); + this_pbe = (struct pbe *) kmap(new_page); + memset(this_pbe, 0, PAGE_SIZE); + *page_ptr = new_page; + } else + this_pbe++; + + return this_pbe; +} + +/** + * get_pageset1_load_addresses - generate pbes for conflicting pages + * + * We check here that pagedir & pages it points to won't collide + * with pages where we're going to restore from the loaded pages + * later. + * + * Returns: + * Zero on success, one if couldn't find enough pages (shouldn't + * happen). 
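/*
 * Minimal sketch (hypothetical helper, assumes <linux/highmem.h>): the conflicting
 * page chain used by ___toi_get_nonconflicting_page() above keeps its next pointer
 * inside the page itself, so no extra allocation is needed to remember the pages
 * that must be released later. Pushing a page onto that chain looks like this.
 */
static void example_push_conflicting_page(struct page *page, struct page **head)
{
        struct page **slot = (struct page **) kmap(page);

        *slot = *head;
        *head = page;
        kunmap(page);
}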
+ **/ +int toi_get_pageset1_load_addresses(void) +{ + int pfn, highallocd = 0, lowallocd = 0; + int low_needed = pagedir1.size - get_highmem_size(pagedir1); + int high_needed = get_highmem_size(pagedir1); + int low_pages_for_highmem = 0; + gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM; + struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL, + *low_pbe_page, *last_low_pbe_page = NULL; + struct pbe **last_high_pbe_ptr = &restore_highmem_pblist, + *this_high_pbe = NULL; + unsigned long orig_low_pfn, orig_high_pfn; + int high_pbes_done = 0, low_pbes_done = 0; + int low_direct = 0, high_direct = 0, result = 0, i; + int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0; + + toi_trace_index++; + + memory_bm_position_reset(pageset1_map); + memory_bm_position_reset(pageset1_copy_map); + + last_low_pbe_ptr = &restore_pblist; + + /* First, allocate pages for the start of our pbe lists. */ + if (high_needed) { + high_pbe_page = ___toi_get_nonconflicting_page(1); + if (!high_pbe_page) { + result = -ENOMEM; + goto out; + } + this_high_pbe = (struct pbe *) kmap(high_pbe_page); + memset(this_high_pbe, 0, PAGE_SIZE); + } + + low_pbe_page = ___toi_get_nonconflicting_page(0); + if (!low_pbe_page) { + result = -ENOMEM; + goto out; + } + this_low_pbe = (struct pbe *) page_address(low_pbe_page); + + /* + * Next, allocate the number of pages we need. + */ + + i = low_needed + high_needed; + + do { + int is_high; + + if (i == low_needed) + flags &= ~__GFP_HIGHMEM; + + page = toi_alloc_page(30, flags); + BUG_ON(!page); + + SetPagePageset1Copy(page); + is_high = PageHighMem(page); + + if (PagePageset1(page)) { + if (is_high) + high_direct++; + else + low_direct++; + } else { + if (is_high) + highallocd++; + else + lowallocd++; + } + } while (--i); + + high_needed -= high_direct; + low_needed -= low_direct; + + /* + * Do we need to use some lowmem pages for the copies of highmem + * pages? + */ + if (high_needed > highallocd) { + low_pages_for_highmem = high_needed - highallocd; + high_needed -= low_pages_for_highmem; + low_needed += low_pages_for_highmem; + } + + /* + * Now generate our pbes (which will be used for the atomic restore), + * and free unneeded pages. + */ + memory_bm_position_reset(pageset1_copy_map); + for (pfn = memory_bm_next_pfn(pageset1_copy_map, 0); pfn != BM_END_OF_MAP; + pfn = memory_bm_next_pfn(pageset1_copy_map, 0)) { + int is_high; + page = pfn_to_page(pfn); + is_high = PageHighMem(page); + + if (PagePageset1(page)) + continue; + + /* Nope. We're going to use this page. Add a pbe. 
*/ + if (is_high || low_pages_for_highmem) { + struct page *orig_page; + high_pbes_done++; + if (!is_high) + low_pages_for_highmem--; + do { + orig_high_pfn = memory_bm_next_pfn(pageset1_map, 0); + BUG_ON(orig_high_pfn == BM_END_OF_MAP); + orig_page = pfn_to_page(orig_high_pfn); + } while (!PageHighMem(orig_page) || + PagePageset1Copy(orig_page)); + + this_high_pbe->orig_address = (void *) orig_high_pfn; + this_high_pbe->address = page; + this_high_pbe->next = NULL; + toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p", + high_page, high_offset, page, orig_high_pfn, orig_page); + if (last_high_pbe_page != high_pbe_page) { + *last_high_pbe_ptr = + (struct pbe *) high_pbe_page; + if (last_high_pbe_page) { + kunmap(last_high_pbe_page); + high_page++; + high_offset = 0; + } else + high_offset++; + last_high_pbe_page = high_pbe_page; + } else { + *last_high_pbe_ptr = this_high_pbe; + high_offset++; + } + last_high_pbe_ptr = &this_high_pbe->next; + this_high_pbe = get_next_pbe(&high_pbe_page, + this_high_pbe, 1); + if (IS_ERR(this_high_pbe)) { + printk(KERN_INFO + "This high pbe is an error.\n"); + return -ENOMEM; + } + } else { + struct page *orig_page; + low_pbes_done++; + do { + orig_low_pfn = memory_bm_next_pfn(pageset1_map, 0); + BUG_ON(orig_low_pfn == BM_END_OF_MAP); + orig_page = pfn_to_page(orig_low_pfn); + } while (PageHighMem(orig_page) || + PagePageset1Copy(orig_page)); + + this_low_pbe->orig_address = page_address(orig_page); + this_low_pbe->address = page_address(page); + this_low_pbe->next = NULL; + toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p", + low_page, low_offset, this_low_pbe->orig_address, + orig_low_pfn, this_low_pbe->address); + TOI_TRACE_DEBUG(orig_low_pfn, "LoadAddresses (%d/%d): %p=>%p", low_page, low_offset, this_low_pbe->orig_address, this_low_pbe->address); + *last_low_pbe_ptr = this_low_pbe; + last_low_pbe_ptr = &this_low_pbe->next; + this_low_pbe = get_next_pbe(&low_pbe_page, + this_low_pbe, 0); + if (low_pbe_page != last_low_pbe_page) { + if (last_low_pbe_page) { + low_page++; + low_offset = 0; + } else { + low_offset++; + } + last_low_pbe_page = low_pbe_page; + } else + low_offset++; + if (IS_ERR(this_low_pbe)) { + printk(KERN_INFO "this_low_pbe is an error.\n"); + return -ENOMEM; + } + } + } + + if (high_pbe_page) + kunmap(high_pbe_page); + + if (last_high_pbe_page != high_pbe_page) { + if (last_high_pbe_page) + kunmap(last_high_pbe_page); + toi__free_page(29, high_pbe_page); + } + + free_conflicting_pages(); + +out: + return result; +} + +int add_boot_kernel_data_pbe(void) +{ + this_low_pbe->address = (char *) __toi_get_nonconflicting_page(); + if (!this_low_pbe->address) { + printk(KERN_INFO "Failed to get bkd atomic restore buffer."); + return -ENOMEM; + } + + toi_bkd.size = sizeof(toi_bkd); + memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd)); + + *last_low_pbe_ptr = this_low_pbe; + this_low_pbe->orig_address = (char *) boot_kernel_data_buffer; + this_low_pbe->next = NULL; + return 0; +} diff -Nur linux-4.2/kernel/power/tuxonice_pagedir.h linux-4.2-pck/kernel/power/tuxonice_pagedir.h --- linux-4.2/kernel/power/tuxonice_pagedir.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_pagedir.h 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,50 @@ +/* + * kernel/power/tuxonice_pagedir.h + * + * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * Declarations for routines for handling pagesets. 
+ */ + +#ifndef KERNEL_POWER_PAGEDIR_H +#define KERNEL_POWER_PAGEDIR_H + +/* Pagedir + * + * Contains the metadata for a set of pages saved in the image. + */ + +struct pagedir { + int id; + unsigned long size; +#ifdef CONFIG_HIGHMEM + unsigned long size_high; +#endif +}; + +#ifdef CONFIG_HIGHMEM +#define get_highmem_size(pagedir) (pagedir.size_high) +#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0) +#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0) +#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high) +#else +#define get_highmem_size(pagedir) (0) +#define set_highmem_size(pagedir, sz) do { } while (0) +#define inc_highmem_size(pagedir) do { } while (0) +#define get_lowmem_size(pagedir) (pagedir.size) +#endif + +extern struct pagedir pagedir1, pagedir2; + +extern void toi_copy_pageset1(void); + +extern int toi_get_pageset1_load_addresses(void); + +extern unsigned long __toi_get_nonconflicting_page(void); +struct page *___toi_get_nonconflicting_page(int can_be_highmem); + +extern void toi_reset_alt_image_pageset2_pfn(void); +extern int add_boot_kernel_data_pbe(void); +#endif diff -Nur linux-4.2/kernel/power/tuxonice_pageflags.c linux-4.2-pck/kernel/power/tuxonice_pageflags.c --- linux-4.2/kernel/power/tuxonice_pageflags.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_pageflags.c 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,18 @@ +/* + * kernel/power/tuxonice_pageflags.c + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * Routines for serialising and relocating pageflags in which we + * store our image metadata. + */ + +#include "tuxonice_pageflags.h" +#include "power.h" + +int toi_pageflags_space_needed(void) +{ + return memory_bm_space_needed(pageset1_map); +} diff -Nur linux-4.2/kernel/power/tuxonice_pageflags.h linux-4.2-pck/kernel/power/tuxonice_pageflags.h --- linux-4.2/kernel/power/tuxonice_pageflags.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_pageflags.h 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,106 @@ +/* + * kernel/power/tuxonice_pageflags.h + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. 
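/*
 * Minimal sketch (hypothetical helper): the accessors from tuxonice_pagedir.h above
 * compile down to plain field reads, and with CONFIG_HIGHMEM disabled the highmem
 * component disappears entirely, so callers can always write the full sum.
 */
static unsigned long example_total_image_pages(void)
{
        return get_lowmem_size(pagedir1) + get_highmem_size(pagedir1) +
               get_lowmem_size(pagedir2) + get_highmem_size(pagedir2);
}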
+ */ + +#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H +#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H + +struct memory_bitmap; +void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); +void memory_bm_clear(struct memory_bitmap *bm); + +int mem_bm_set_bit_check(struct memory_bitmap *bm, int index, unsigned long pfn); +void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); +unsigned long memory_bm_next_pfn(struct memory_bitmap *bm, int index); +unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index); +void memory_bm_position_reset(struct memory_bitmap *bm); +void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); +int toi_alloc_bitmap(struct memory_bitmap **bm); +void toi_free_bitmap(struct memory_bitmap **bm); +void memory_bm_clear(struct memory_bitmap *bm); +void memory_bm_clear_bit(struct memory_bitmap *bm, int index, unsigned long pfn); +void memory_bm_set_bit(struct memory_bitmap *bm, int index, unsigned long pfn); +int memory_bm_test_bit(struct memory_bitmap *bm, int index, unsigned long pfn); +int memory_bm_test_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); +void memory_bm_clear_bit_index(struct memory_bitmap *bm, int index, unsigned long pfn); + +struct toi_module_ops; +int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) + (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); +int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) + (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); +int memory_bm_space_needed(struct memory_bitmap *bm); + +extern struct memory_bitmap *pageset1_map; +extern struct memory_bitmap *pageset1_copy_map; +extern struct memory_bitmap *pageset2_map; +extern struct memory_bitmap *page_resave_map; +extern struct memory_bitmap *io_map; +extern struct memory_bitmap *nosave_map; +extern struct memory_bitmap *free_map; +extern struct memory_bitmap *compare_map; + +#define PagePageset1(page) \ + (pageset1_map && memory_bm_test_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) +#define SetPagePageset1(page) \ + (memory_bm_set_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) +#define ClearPagePageset1(page) \ + (memory_bm_clear_bit(pageset1_map, smp_processor_id(), page_to_pfn(page))) + +#define PagePageset1Copy(page) \ + (memory_bm_test_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) +#define SetPagePageset1Copy(page) \ + (memory_bm_set_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) +#define ClearPagePageset1Copy(page) \ + (memory_bm_clear_bit(pageset1_copy_map, smp_processor_id(), page_to_pfn(page))) + +#define PagePageset2(page) \ + (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) +#define SetPagePageset2(page) \ + (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) +#define ClearPagePageset2(page) \ + (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) + +#define PageWasRW(page) \ + (memory_bm_test_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) +#define SetPageWasRW(page) \ + (memory_bm_set_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) +#define ClearPageWasRW(page) \ + (memory_bm_clear_bit(pageset2_map, smp_processor_id(), page_to_pfn(page))) + +#define PageResave(page) (page_resave_map ? 
\ + memory_bm_test_bit(page_resave_map, smp_processor_id(), page_to_pfn(page)) : 0) +#define SetPageResave(page) \ + (memory_bm_set_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) +#define ClearPageResave(page) \ + (memory_bm_clear_bit(page_resave_map, smp_processor_id(), page_to_pfn(page))) + +#define PageNosave(page) (nosave_map ? \ + memory_bm_test_bit(nosave_map, smp_processor_id(), page_to_pfn(page)) : 0) +#define SetPageNosave(page) \ + (mem_bm_set_bit_check(nosave_map, smp_processor_id(), page_to_pfn(page))) +#define ClearPageNosave(page) \ + (memory_bm_clear_bit(nosave_map, smp_processor_id(), page_to_pfn(page))) + +#define PageNosaveFree(page) (free_map ? \ + memory_bm_test_bit(free_map, smp_processor_id(), page_to_pfn(page)) : 0) +#define SetPageNosaveFree(page) \ + (memory_bm_set_bit(free_map, smp_processor_id(), page_to_pfn(page))) +#define ClearPageNosaveFree(page) \ + (memory_bm_clear_bit(free_map, smp_processor_id(), page_to_pfn(page))) + +#define PageCompareChanged(page) (compare_map ? \ + memory_bm_test_bit(compare_map, smp_processor_id(), page_to_pfn(page)) : 0) +#define SetPageCompareChanged(page) \ + (memory_bm_set_bit(compare_map, smp_processor_id(), page_to_pfn(page))) +#define ClearPageCompareChanged(page) \ + (memory_bm_clear_bit(compare_map, smp_processor_id(), page_to_pfn(page))) + +extern void save_pageflags(struct memory_bitmap *pagemap); +extern int load_pageflags(struct memory_bitmap *pagemap); +extern int toi_pageflags_space_needed(void); +#endif diff -Nur linux-4.2/kernel/power/tuxonice_power_off.c linux-4.2-pck/kernel/power/tuxonice_power_off.c --- linux-4.2/kernel/power/tuxonice_power_off.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_power_off.c 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,286 @@ +/* + * kernel/power/tuxonice_power_off.c + * + * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * Support for powering down. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "tuxonice.h" +#include "tuxonice_ui.h" +#include "tuxonice_power_off.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_modules.h" +#include "tuxonice_io.h" + +unsigned long toi_poweroff_method; /* 0 - Kernel power off */ + +static int wake_delay; +static char lid_state_file[256], wake_alarm_dir[256]; +static struct file *lid_file, *alarm_file, *epoch_file; +static int post_wake_state = -1; + +static int did_suspend_to_both; + +/* + * __toi_power_down + * Functionality : Powers down or reboots the computer once the image + * has been written to disk. + * Key Assumptions : Able to reboot/power down via code called or that + * the warning emitted if the calls fail will be visible + * to the user (ie printk resumes devices). + */ + +static void __toi_power_down(int method) +{ + int error; + + toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." : + "Powering down."); + + if (test_result_state(TOI_ABORTED)) + goto out; + + if (test_action_state(TOI_REBOOT)) + kernel_restart(NULL); + + switch (method) { + case 0: + break; + case 3: + /* + * Re-read the overwritten part of pageset2 to make post-resume + * faster. + */ + if (read_pageset2(1)) + panic("Attempt to reload pagedir 2 failed. 
" + "Try rebooting."); + + pm_prepare_console(); + + error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); + if (!error) { + pm_restore_gfp_mask(); + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + pm_restrict_gfp_mask(); + if (!error) + did_suspend_to_both = 1; + } + pm_notifier_call_chain(PM_POST_SUSPEND); + pm_restore_console(); + + /* Success - we're now post-resume-from-ram */ + if (did_suspend_to_both) + return; + + /* Failed to suspend to ram - do normal power off */ + break; + case 4: + /* + * If succeeds, doesn't return. If fails, do a simple + * powerdown. + */ + hibernation_platform_enter(); + break; + case 5: + /* Historic entry only now */ + break; + } + + if (method && method != 5) + toi_cond_pause(1, + "Falling back to alternate power off method."); + + if (test_result_state(TOI_ABORTED)) + goto out; + + if (pm_power_off) + kernel_power_off(); + kernel_halt(); + toi_cond_pause(1, "Powerdown failed."); + while (1) + cpu_relax(); + +out: + if (read_pageset2(1)) + panic("Attempt to reload pagedir 2 failed. Try rebooting."); + return; +} + +#define CLOSE_FILE(file) \ + if (file) { \ + filp_close(file, NULL); file = NULL; \ + } + +static void powerdown_cleanup(int toi_or_resume) +{ + if (!toi_or_resume) + return; + + CLOSE_FILE(lid_file); + CLOSE_FILE(alarm_file); + CLOSE_FILE(epoch_file); +} + +static void open_file(char *format, char *arg, struct file **var, int mode, + char *desc) +{ + char buf[256]; + + if (strlen(arg)) { + sprintf(buf, format, arg); + *var = filp_open(buf, mode, 0); + if (IS_ERR(*var) || !*var) { + printk(KERN_INFO "Failed to open %s file '%s' (%p).\n", + desc, buf, *var); + *var = NULL; + } + } +} + +static int powerdown_init(int toi_or_resume) +{ + if (!toi_or_resume) + return 0; + + did_suspend_to_both = 0; + + open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file, + O_RDONLY, "lid"); + + if (strlen(wake_alarm_dir)) { + open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir, + &alarm_file, O_WRONLY, "alarm"); + + open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir, + &epoch_file, O_RDONLY, "epoch"); + } + + return 0; +} + +static int lid_closed(void) +{ + char array[25]; + ssize_t size; + loff_t pos = 0; + + if (!lid_file) + return 0; + + size = vfs_read(lid_file, (char __user *) array, 25, &pos); + if ((int) size < 1) { + printk(KERN_INFO "Failed to read lid state file (%d).\n", + (int) size); + return 0; + } + + if (!strcmp(array, "state: closed\n")) + return 1; + + return 0; +} + +static void write_alarm_file(int value) +{ + ssize_t size; + char buf[40]; + loff_t pos = 0; + + if (!alarm_file) + return; + + sprintf(buf, "%d\n", value); + + size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos); + + if (size < 0) + printk(KERN_INFO "Error %d writing alarm value %s.\n", + (int) size, buf); +} + +/** + * toi_check_resleep: See whether to powerdown again after waking. + * + * After waking, check whether we should powerdown again in a (usually + * different) way. We only do this if the lid switch is still closed. + */ +void toi_check_resleep(void) +{ + /* We only return if we suspended to ram and woke. 
*/ + if (lid_closed() && post_wake_state >= 0) + __toi_power_down(post_wake_state); +} + +void toi_power_down(void) +{ + if (alarm_file && wake_delay) { + char array[25]; + loff_t pos = 0; + size_t size = vfs_read(epoch_file, (char __user *) array, 25, + &pos); + + if (((int) size) < 1) + printk(KERN_INFO "Failed to read epoch file (%d).\n", + (int) size); + else { + unsigned long since_epoch; + if (!kstrtoul(array, 0, &since_epoch)) { + /* Clear any wakeup time. */ + write_alarm_file(0); + + /* Set new wakeup time. */ + write_alarm_file(since_epoch + wake_delay); + } + } + } + + __toi_power_down(toi_poweroff_method); + + toi_check_resleep(); +} + +static struct toi_sysfs_data sysfs_params[] = { +#if defined(CONFIG_ACPI) + SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL), + SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL), + SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL), + SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0, + NULL), + SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0), + SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both, + 0, 0, 0, NULL) +#endif +}; + +static struct toi_module_ops powerdown_ops = { + .type = MISC_HIDDEN_MODULE, + .name = "poweroff", + .initialise = powerdown_init, + .cleanup = powerdown_cleanup, + .directory = "[ROOT]", + .module = THIS_MODULE, + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +int toi_poweroff_init(void) +{ + return toi_register_module(&powerdown_ops); +} + +void toi_poweroff_exit(void) +{ + toi_unregister_module(&powerdown_ops); +} diff -Nur linux-4.2/kernel/power/tuxonice_power_off.h linux-4.2-pck/kernel/power/tuxonice_power_off.h --- linux-4.2/kernel/power/tuxonice_power_off.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_power_off.h 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,24 @@ +/* + * kernel/power/tuxonice_power_off.h + * + * Copyright (C) 2006-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * Support for the powering down. + */ + +int toi_pm_state_finish(void); +void toi_power_down(void); +extern unsigned long toi_poweroff_method; +int toi_poweroff_init(void); +void toi_poweroff_exit(void); +void toi_check_resleep(void); + +extern int platform_begin(int platform_mode); +extern int platform_pre_snapshot(int platform_mode); +extern void platform_leave(int platform_mode); +extern void platform_end(int platform_mode); +extern void platform_finish(int platform_mode); +extern int platform_pre_restore(int platform_mode); +extern void platform_restore_cleanup(int platform_mode); diff -Nur linux-4.2/kernel/power/tuxonice_prepare_image.c linux-4.2-pck/kernel/power/tuxonice_prepare_image.c --- linux-4.2/kernel/power/tuxonice_prepare_image.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_prepare_image.c 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,1080 @@ +/* + * kernel/power/tuxonice_prepare_image.c + * + * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * We need to eat memory until we can: + * 1. Perform the save without changing anything (RAM_NEEDED < #pages) + * 2. Fit it all in available space (toiActiveAllocator->available_space() >= + * main_storage_needed()) + * 3. 
Reload the pagedir and pageset1 to places that don't collide with their + * final destinations, not knowing to what extent the resumed kernel will + * overlap with the one loaded at boot time. I think the resumed kernel + * should overlap completely, but I don't want to rely on this as it is + * an unproven assumption. We therefore assume there will be no overlap at + * all (worse case). + * 4. Meet the user's requested limit (if any) on the size of the image. + * The limit is in MB, so pages/256 (assuming 4K pages). + * + */ + +#include +#include +#include +#include +#include +#include + +#include "tuxonice_pageflags.h" +#include "tuxonice_modules.h" +#include "tuxonice_io.h" +#include "tuxonice_ui.h" +#include "tuxonice_prepare_image.h" +#include "tuxonice.h" +#include "tuxonice_extent.h" +#include "tuxonice_checksum.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_alloc.h" +#include "tuxonice_atomic_copy.h" +#include "tuxonice_builtin.h" + +static unsigned long num_nosave, main_storage_allocated, storage_limit, + header_storage_needed; +unsigned long extra_pd1_pages_allowance = + CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE; +long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT; +static int no_ps2_needed; + +struct attention_list { + struct task_struct *task; + struct attention_list *next; +}; + +static struct attention_list *attention_list; + +#define PAGESET1 0 +#define PAGESET2 1 + +void free_attention_list(void) +{ + struct attention_list *last = NULL; + + while (attention_list) { + last = attention_list; + attention_list = attention_list->next; + toi_kfree(6, last, sizeof(*last)); + } +} + +static int build_attention_list(void) +{ + int i, task_count = 0; + struct task_struct *p; + struct attention_list *next; + + /* + * Count all userspace process (with task->mm) marked PF_NOFREEZE. + */ + toi_read_lock_tasklist(); + for_each_process(p) + if ((p->flags & PF_NOFREEZE) || p == current) + task_count++; + toi_read_unlock_tasklist(); + + /* + * Allocate attention list structs. 
+ */ + for (i = 0; i < task_count; i++) { + struct attention_list *this = + toi_kzalloc(6, sizeof(struct attention_list), + TOI_WAIT_GFP); + if (!this) { + printk(KERN_INFO "Failed to allocate slab for " + "attention list.\n"); + free_attention_list(); + return 1; + } + this->next = NULL; + if (attention_list) + this->next = attention_list; + attention_list = this; + } + + next = attention_list; + toi_read_lock_tasklist(); + for_each_process(p) + if ((p->flags & PF_NOFREEZE) || p == current) { + next->task = p; + next = next->next; + } + toi_read_unlock_tasklist(); + return 0; +} + +static void pageset2_full(void) +{ + struct zone *zone; + struct page *page; + unsigned long flags; + int i; + + toi_trace_index++; + + for_each_populated_zone(zone) { + spin_lock_irqsave(&zone->lru_lock, flags); + for_each_lru(i) { + if (!zone_page_state(zone, NR_LRU_BASE + i)) + continue; + + list_for_each_entry(page, &zone->lruvec.lists[i], lru) { + struct address_space *mapping; + + mapping = page_mapping(page); + if (!mapping || !mapping->host || + !(mapping->host->i_flags & S_ATOMIC_COPY)) { + if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { + TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 unmodified."); + } else { + TOI_TRACE_DEBUG(page_to_pfn(page), "_Pageset2 pageset2_full."); + SetPagePageset2(page); + } + } + } + } + spin_unlock_irqrestore(&zone->lru_lock, flags); + } +} + +/* + * toi_mark_task_as_pageset + * Functionality : Marks all the saveable pages belonging to a given process + * as belonging to a particular pageset. + */ + +static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + + mm = t->active_mm; + + if (!mm || !mm->mmap) + return; + + toi_trace_index++; + + if (!irqs_disabled()) + down_read(&mm->mmap_sem); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long posn; + + if (!vma->vm_start || + vma->vm_flags & VM_PFNMAP) + continue; + + for (posn = vma->vm_start; posn < vma->vm_end; + posn += PAGE_SIZE) { + struct page *page = follow_page(vma, posn, 0); + struct address_space *mapping; + + if (!page || !pfn_valid(page_to_pfn(page))) + continue; + + mapping = page_mapping(page); + if (mapping && mapping->host && + mapping->host->i_flags & S_ATOMIC_COPY && pageset2) + continue; + + if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { + TOI_TRACE_DEBUG(page_to_pfn(page), "_Unmodified %d", pageset2 ? 1 : 2); + continue; + } + + if (pageset2) { + TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 1"); + SetPagePageset2(page); + } else { + TOI_TRACE_DEBUG(page_to_pfn(page), "_MarkTaskAsPageset 2"); + ClearPagePageset2(page); + SetPagePageset1(page); + } + } + } + + if (!irqs_disabled()) + up_read(&mm->mmap_sem); +} + +static void mark_tasks(int pageset) +{ + struct task_struct *p; + + toi_read_lock_tasklist(); + for_each_process(p) { + if (!p->mm) + continue; + + if (p->flags & PF_KTHREAD) + continue; + + toi_mark_task_as_pageset(p, pageset); + } + toi_read_unlock_tasklist(); + +} + +/* mark_pages_for_pageset2 + * + * Description: Mark unshared pages in processes not needed for hibernate as + * being able to be written out in a separate pagedir. + * HighMem pages are simply marked as pageset2. They won't be + * needed during hibernate. 
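/*
 * Minimal sketch (hypothetical predicate): the classification rule applied by the
 * marking helpers above, stated on its own. A page still read-only from a kept image
 * stays where it is; anything else belonging to a non-essential task is pushed into
 * pageset2.
 */
static int example_page_goes_to_pageset2(struct page *page)
{
        if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE))
                return 0;       /* unmodified page from a kept image */
        return 1;
}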
+ */ + +static void toi_mark_pages_for_pageset2(void) +{ + struct attention_list *this = attention_list; + + memory_bm_clear(pageset2_map); + + if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed) + return; + + if (test_action_state(TOI_PAGESET2_FULL)) + pageset2_full(); + else + mark_tasks(PAGESET2); + + /* + * Because the tasks in attention_list are ones related to hibernating, + * we know that they won't go away under us. + */ + + while (this) { + if (!test_result_state(TOI_ABORTED)) + toi_mark_task_as_pageset(this->task, PAGESET1); + this = this->next; + } +} + +/* + * The atomic copy of pageset1 is stored in pageset2 pages. + * But if pageset1 is larger (normally only just after boot), + * we need to allocate extra pages to store the atomic copy. + * The following data struct and functions are used to handle + * the allocation and freeing of that memory. + */ + +static unsigned long extra_pages_allocated; + +struct extras { + struct page *page; + int order; + struct extras *next; +}; + +static struct extras *extras_list; + +/* toi_free_extra_pagedir_memory + * + * Description: Free previously allocated extra pagedir memory. + */ +void toi_free_extra_pagedir_memory(void) +{ + /* Free allocated pages */ + while (extras_list) { + struct extras *this = extras_list; + int i; + + extras_list = this->next; + + for (i = 0; i < (1 << this->order); i++) + ClearPageNosave(this->page + i); + + toi_free_pages(9, this->page, this->order); + toi_kfree(7, this, sizeof(*this)); + } + + extra_pages_allocated = 0; +} + +/* toi_allocate_extra_pagedir_memory + * + * Description: Allocate memory for making the atomic copy of pagedir1 in the + * case where it is bigger than pagedir2. + * Arguments: int num_to_alloc: Number of extra pages needed. + * Result: int. Number of extra pages we now have allocated. + */ +static int toi_allocate_extra_pagedir_memory(int extra_pages_needed) +{ + int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated; + gfp_t flags = TOI_ATOMIC_GFP; + + if (num_to_alloc < 1) + return 0; + + order = fls(num_to_alloc); + if (order >= MAX_ORDER) + order = MAX_ORDER - 1; + + while (num_to_alloc) { + struct page *newpage; + unsigned long virt; + struct extras *extras_entry; + + while ((1 << order) > num_to_alloc) + order--; + + extras_entry = (struct extras *) toi_kzalloc(7, + sizeof(struct extras), TOI_ATOMIC_GFP); + + if (!extras_entry) + return extra_pages_allocated; + + virt = toi_get_free_pages(9, flags, order); + while (!virt && order) { + order--; + virt = toi_get_free_pages(9, flags, order); + } + + if (!virt) { + toi_kfree(7, extras_entry, sizeof(*extras_entry)); + return extra_pages_allocated; + } + + newpage = virt_to_page(virt); + + extras_entry->page = newpage; + extras_entry->order = order; + extras_entry->next = extras_list; + + extras_list = extras_entry; + + for (j = 0; j < (1 << order); j++) { + SetPageNosave(newpage + j); + SetPagePageset1Copy(newpage + j); + } + + extra_pages_allocated += (1 << order); + num_to_alloc -= (1 << order); + } + + return extra_pages_allocated; +} + +/* + * real_nr_free_pages: Count pcp pages for a zone type or all zones + * (-1 for all, otherwise zone_idx() result desired). 
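+ * The argument is interpreted as a bitmask of zone indices: bit
+ * zone_idx(zone) selects that zone, so callers pass all_zones_mask for
+ * every zone or (1 << ZONE_HIGHMEM) for highmem only (see
+ * tuxonice_prepare_image.h).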
+ */ +unsigned long real_nr_free_pages(unsigned long zone_idx_mask) +{ + struct zone *zone; + int result = 0, cpu; + + /* PCP lists */ + for_each_populated_zone(zone) { + if (!(zone_idx_mask & (1 << zone_idx(zone)))) + continue; + + for_each_online_cpu(cpu) { + struct per_cpu_pageset *pset = + per_cpu_ptr(zone->pageset, cpu); + struct per_cpu_pages *pcp = &pset->pcp; + result += pcp->count; + } + + result += zone_page_state(zone, NR_FREE_PAGES); + } + return result; +} + +/* + * Discover how much extra memory will be required by the drivers + * when they're asked to hibernate. We can then ensure that amount + * of memory is available when we really want it. + */ +static void get_extra_pd1_allowance(void) +{ + unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final; + + toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers."); + + if (toi_go_atomic(PMSG_FREEZE, 1)) + return; + + final = real_nr_free_pages(all_zones_mask); + toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0); + + extra_pd1_pages_allowance = (orig_num_free > final) ? + orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE : + MIN_EXTRA_PAGES_ALLOWANCE; +} + +/* + * Amount of storage needed, possibly taking into account the + * expected compression ratio and possibly also ignoring our + * allowance for extra pages. + */ +static unsigned long main_storage_needed(int use_ecr, + int ignore_extra_pd1_allow) +{ + return (pagedir1.size + pagedir2.size + + (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) * + (use_ecr ? toi_expected_compression_ratio() : 100) / 100; +} + +/* + * Storage needed for the image header, in bytes until the return. + */ +unsigned long get_header_storage_needed(void) +{ + unsigned long bytes = sizeof(struct toi_header) + + toi_header_storage_for_modules() + + toi_pageflags_space_needed() + + fs_info_space_needed(); + + return DIV_ROUND_UP(bytes, PAGE_SIZE); +} + +/* + * When freeing memory, pages from either pageset might be freed. + * + * When seeking to free memory to be able to hibernate, for every ps1 page + * freed, we need 2 less pages for the atomic copy because there is one less + * page to copy and one more page into which data can be copied. + * + * Freeing ps2 pages saves us nothing directly. No more memory is available + * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but + * that's too much work to figure out. + * + * => ps1_to_free functions + * + * Of course if we just want to reduce the image size, because of storage + * limitations or an image size limit either ps will do. + * + * => any_to_free function + */ + +static unsigned long lowpages_usable_for_highmem_copy(void) +{ + unsigned long needed = get_lowmem_size(pagedir1) + + extra_pd1_pages_allowance + MIN_FREE_RAM + + toi_memory_for_modules(0), + available = get_lowmem_size(pagedir2) + + real_nr_free_low_pages() + extra_pages_allocated; + + return available > needed ? available - needed : 0; +} + +static unsigned long highpages_ps1_to_free(void) +{ + unsigned long need = get_highmem_size(pagedir1), + available = get_highmem_size(pagedir2) + + real_nr_free_high_pages() + + lowpages_usable_for_highmem_copy(); + + return need > available ? DIV_ROUND_UP(need - available, 2) : 0; +} + +static unsigned long lowpages_ps1_to_free(void) +{ + unsigned long needed = get_lowmem_size(pagedir1) + + extra_pd1_pages_allowance + MIN_FREE_RAM + + toi_memory_for_modules(0), + available = get_lowmem_size(pagedir2) + + real_nr_free_low_pages() + extra_pages_allocated; + + return needed > available ? 
DIV_ROUND_UP(needed - available, 2) : 0; +} + +static unsigned long current_image_size(void) +{ + return pagedir1.size + pagedir2.size + header_storage_needed; +} + +static unsigned long storage_still_required(void) +{ + unsigned long needed = main_storage_needed(1, 1); + return needed > storage_limit ? needed - storage_limit : 0; +} + +static unsigned long ram_still_required(void) +{ + unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) + + 2 * extra_pd1_pages_allowance, + available = real_nr_free_low_pages() + extra_pages_allocated; + return needed > available ? needed - available : 0; +} + +unsigned long any_to_free(int use_image_size_limit) +{ + int use_soft_limit = use_image_size_limit && image_size_limit > 0; + unsigned long current_size = current_image_size(), + soft_limit = use_soft_limit ? (image_size_limit << 8) : 0, + to_free = use_soft_limit ? (current_size > soft_limit ? + current_size - soft_limit : 0) : 0, + storage_limit = storage_still_required(), + ram_limit = ram_still_required(), + first_max = max(to_free, storage_limit); + + return max(first_max, ram_limit); +} + +static int need_pageset2(void) +{ + return (real_nr_free_low_pages() + extra_pages_allocated - + 2 * extra_pd1_pages_allowance - MIN_FREE_RAM - + toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size; +} + +/* amount_needed + * + * Calculates the amount by which the image size needs to be reduced to meet + * our constraints. + */ +static unsigned long amount_needed(int use_image_size_limit) +{ + return max(highpages_ps1_to_free() + lowpages_ps1_to_free(), + any_to_free(use_image_size_limit)); +} + +static int image_not_ready(int use_image_size_limit) +{ + toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, + "Amount still needed (%lu) > 0:%u," + " Storage allocd: %lu < %lu: %u.\n", + amount_needed(use_image_size_limit), + (amount_needed(use_image_size_limit) > 0), + main_storage_allocated, + main_storage_needed(1, 1), + main_storage_allocated < main_storage_needed(1, 1)); + + toi_cond_pause(0, NULL); + + return (amount_needed(use_image_size_limit) > 0) || + main_storage_allocated < main_storage_needed(1, 1); +} + +static void display_failure_reason(int tries_exceeded) +{ + unsigned long storage_required = storage_still_required(), + ram_required = ram_still_required(), + high_ps1 = highpages_ps1_to_free(), + low_ps1 = lowpages_ps1_to_free(); + + printk(KERN_INFO "Failed to prepare the image because...\n"); + + if (!storage_limit) { + printk(KERN_INFO "- You need some storage available to be " + "able to hibernate.\n"); + return; + } + + if (tries_exceeded) + printk(KERN_INFO "- The maximum number of iterations was " + "reached without successfully preparing the " + "image.\n"); + + if (storage_required) { + printk(KERN_INFO " - We need at least %lu pages of storage " + "(ignoring the header), but only have %lu.\n", + main_storage_needed(1, 1), + main_storage_allocated); + set_abort_result(TOI_INSUFFICIENT_STORAGE); + } + + if (ram_required) { + printk(KERN_INFO " - We need %lu more free pages of low " + "memory.\n", ram_required); + printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM); + printk(KERN_INFO " + Reqd. 
by modules : %8lu\n", + toi_memory_for_modules(0)); + printk(KERN_INFO " + 2 * extra allow : %8lu\n", + 2 * extra_pd1_pages_allowance); + printk(KERN_INFO " - Currently free : %8lu\n", + real_nr_free_low_pages()); + printk(KERN_INFO " - Pages allocd : %8lu\n", + extra_pages_allocated); + printk(KERN_INFO " : ========\n"); + printk(KERN_INFO " Still needed : %8lu\n", + ram_required); + + /* Print breakdown of memory needed for modules */ + toi_memory_for_modules(1); + set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); + } + + if (high_ps1) { + printk(KERN_INFO "- We need to free %lu highmem pageset 1 " + "pages.\n", high_ps1); + set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); + } + + if (low_ps1) { + printk(KERN_INFO " - We need to free %ld lowmem pageset 1 " + "pages.\n", low_ps1); + set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); + } +} + +static void display_stats(int always, int sub_extra_pd1_allow) +{ + char buffer[255]; + snprintf(buffer, 254, + "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). " + "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). " + "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n", + + /* Free */ + real_nr_free_pages(all_zones_mask), + real_nr_free_low_pages(), + + /* Sets */ + pagedir1.size, pagedir1.size - get_highmem_size(pagedir1), + pagedir2.size, pagedir2.size - get_highmem_size(pagedir2), + + /* Nosave */ + num_nosave, extra_pages_allocated, + num_nosave - extra_pages_allocated, + + /* Storage */ + main_storage_allocated, + storage_limit, + main_storage_needed(1, sub_extra_pd1_allow), + main_storage_needed(1, 1), + + /* Needed */ + lowpages_ps1_to_free(), highpages_ps1_to_free(), + any_to_free(1), + MIN_FREE_RAM, toi_memory_for_modules(0), + extra_pd1_pages_allowance, + image_size_limit, + + need_pageset2() ? "yes" : "no"); + + if (always) + printk("%s", buffer); + else + toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer); +} + +/* flag_image_pages + * + * This routine generates our lists of pages to be stored in each + * pageset. Since we store the data using extents, and adding new + * extents might allocate a new extent page, this routine may well + * be called more than once. + */ +static void flag_image_pages(int atomic_copy) +{ + int num_free = 0, num_unmodified = 0; + unsigned long loop; + struct zone *zone; + + pagedir1.size = 0; + pagedir2.size = 0; + + set_highmem_size(pagedir1, 0); + set_highmem_size(pagedir2, 0); + + num_nosave = 0; + toi_trace_index++; + + memory_bm_clear(pageset1_map); + + toi_generate_free_page_map(); + + /* + * Pages not to be saved are marked Nosave irrespective of being + * reserved. + */ + for_each_populated_zone(zone) { + int highmem = is_highmem(zone); + + for (loop = 0; loop < zone->spanned_pages; loop++) { + unsigned long pfn = zone->zone_start_pfn + loop; + struct page *page; + int chunk_size; + + if (!pfn_valid(pfn)) { + TOI_TRACE_DEBUG(pfn, "_Flag Invalid"); + continue; + } + + chunk_size = toi_size_of_free_region(zone, pfn); + if (chunk_size) { + unsigned long y; + for (y = pfn; y < pfn + chunk_size; y++) { + page = pfn_to_page(y); + TOI_TRACE_DEBUG(y, "_Flag Free"); + ClearPagePageset1(page); + ClearPagePageset2(page); + } + num_free += chunk_size; + loop += chunk_size - 1; + continue; + } + + page = pfn_to_page(pfn); + + if (PageNosave(page)) { + char *desc = PagePageset1Copy(page) ? "Pageset1Copy" : "NoSave"; + TOI_TRACE_DEBUG(pfn, "_Flag %s", desc); + num_nosave++; + continue; + } + + page = highmem ? 
saveable_highmem_page(zone, pfn) : + saveable_page(zone, pfn); + + if (!page) { + TOI_TRACE_DEBUG(pfn, "_Flag Nosave2"); + num_nosave++; + continue; + } + + if (PageTOI_RO(page) && test_result_state(TOI_KEPT_IMAGE)) { + TOI_TRACE_DEBUG(pfn, "_Unmodified"); + num_unmodified++; + continue; + } + + if (PagePageset2(page)) { + pagedir2.size++; + TOI_TRACE_DEBUG(pfn, "_Flag PS2"); + if (PageHighMem(page)) + inc_highmem_size(pagedir2); + else + SetPagePageset1Copy(page); + if (PageResave(page)) { + SetPagePageset1(page); + ClearPagePageset1Copy(page); + pagedir1.size++; + if (PageHighMem(page)) + inc_highmem_size(pagedir1); + } + } else { + pagedir1.size++; + TOI_TRACE_DEBUG(pfn, "_Flag PS1"); + SetPagePageset1(page); + if (PageHighMem(page)) + inc_highmem_size(pagedir1); + } + } + } + + if (!atomic_copy) + toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0, + "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)" + " + Unmodified (%d) + NumFree (%d) = %d.\n", + pagedir1.size, pagedir2.size, num_nosave, num_unmodified, + num_free, pagedir1.size + pagedir2.size + num_nosave + num_free); +} + +void toi_recalculate_image_contents(int atomic_copy) +{ + memory_bm_clear(pageset1_map); + if (!atomic_copy) { + unsigned long pfn; + memory_bm_position_reset(pageset2_map); + for (pfn = memory_bm_next_pfn(pageset2_map, 0); + pfn != BM_END_OF_MAP; + pfn = memory_bm_next_pfn(pageset2_map, 0)) + ClearPagePageset1Copy(pfn_to_page(pfn)); + /* Need to call this before getting pageset1_size! */ + toi_mark_pages_for_pageset2(); + } + memory_bm_position_reset(pageset2_map); + flag_image_pages(atomic_copy); + + if (!atomic_copy) { + storage_limit = toiActiveAllocator->storage_available(); + display_stats(0, 0); + } +} + +int try_allocate_extra_memory(void) +{ + unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance - + get_lowmem_size(pagedir2); + if (wanted > extra_pages_allocated) { + unsigned long got = toi_allocate_extra_pagedir_memory(wanted); + if (wanted < got) { + toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, + "Want %d extra pages for pageset1, got %d.\n", + wanted, got); + return 1; + } + } + return 0; +} + +/* update_image + * + * Allocate [more] memory and storage for the image. + */ +static void update_image(int ps2_recalc) +{ + int old_header_req; + unsigned long seek; + + if (try_allocate_extra_memory()) + return; + + if (ps2_recalc) + goto recalc; + + thaw_kernel_threads(); + + /* + * Allocate remaining storage space, if possible, up to the + * maximum we know we'll need. It's okay to allocate the + * maximum if the writer is the swapwriter, but + * we don't want to grab all available space on an NFS share. + * We therefore ignore the expected compression ratio here, + * thereby trying to allocate the maximum image size we could + * need (assuming compression doesn't expand the image), but + * don't complain if we can't get the full amount we're after. + */ + + do { + int result; + + old_header_req = header_storage_needed; + toiActiveAllocator->reserve_header_space(header_storage_needed); + + /* How much storage is free with the reservation applied? */ + storage_limit = toiActiveAllocator->storage_available(); + seek = min(storage_limit, main_storage_needed(0, 0)); + + result = toiActiveAllocator->allocate_storage(seek); + if (result) + printk("Failed to allocate storage (%d).\n", result); + + main_storage_allocated = + toiActiveAllocator->storage_allocated(); + + /* Need more header because more storage allocated? 
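+ * Allocating more main storage can increase the amount of header
+ * space required, so loop until the requirement stops growing.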
*/
+ header_storage_needed = get_header_storage_needed();
+
+ } while (header_storage_needed > old_header_req);
+
+ if (freeze_kernel_threads())
+ set_abort_result(TOI_FREEZING_FAILED);
+
+recalc:
+ toi_recalculate_image_contents(0);
+}
+
+/* attempt_to_freeze
+ *
+ * Try to freeze processes.
+ */
+
+static int attempt_to_freeze(void)
+{
+ int result;
+
+ /* Stop processes before checking again */
+ toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing "
+ "filesystems.");
+ result = freeze_processes();
+
+ if (result)
+ set_abort_result(TOI_FREEZING_FAILED);
+
+ result = freeze_kernel_threads();
+
+ if (result)
+ set_abort_result(TOI_FREEZING_FAILED);
+
+ return result;
+}
+
+/* eat_memory
+ *
+ * Try to free some memory, either to meet hard or soft constraints on the image
+ * characteristics.
+ *
+ * Hard constraints:
+ * - Pageset1 must be < half of memory;
+ * - We must have enough memory free at resume time to have pageset1
+ * be able to be loaded in pages that don't conflict with where it has to
+ * be restored.
+ * Soft constraints:
+ * - User specified image size limit.
+ */
+static void eat_memory(void)
+{
+ unsigned long amount_wanted = 0;
+ int did_eat_memory = 0;
+
+ /*
+ * Note that if we have enough storage space and enough free memory, we
+ * may exit without eating anything. We give up when the last 10
+ * iterations ate no extra pages because we're not going to get much
+ * more anyway, but the few pages we get will take a lot of time.
+ *
+ * We freeze processes before beginning, and then unfreeze them if we
+ * need to eat memory until we think we have enough. If our attempts
+ * to freeze fail, we give up and abort.
+ */
+
+ amount_wanted = amount_needed(1);
+
+ switch (image_size_limit) {
+ case -1: /* Don't eat any memory */
+ if (amount_wanted > 0) {
+ set_abort_result(TOI_WOULD_EAT_MEMORY);
+ return;
+ }
+ break;
+ case -2: /* Free caches only */
+ drop_pagecache();
+ toi_recalculate_image_contents(0);
+ amount_wanted = amount_needed(1);
+ break;
+ default:
+ break;
+ }
+
+ if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) &&
+ image_size_limit != -1) {
+ unsigned long request = amount_wanted;
+ unsigned long high_req = max(highpages_ps1_to_free(),
+ any_to_free(1));
+ unsigned long low_req = lowpages_ps1_to_free();
+ unsigned long got = 0;
+
+ toi_prepare_status(CLEAR_BAR,
+ "Seeking to free %ldMB of memory.",
+ MB(amount_wanted));
+
+ thaw_kernel_threads();
+
+ /*
+ * Ask for too many because shrink_memory_mask doesn't
+ * currently return enough most of the time.
+ */
+
+ if (low_req)
+ got = shrink_memory_mask(low_req, GFP_KERNEL);
+ if (high_req)
+ shrink_memory_mask(high_req - got, GFP_HIGHUSER);
+
+ did_eat_memory = 1;
+
+ toi_recalculate_image_contents(0);
+
+ amount_wanted = amount_needed(1);
+
+ printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &"
+ " %ld pages from anywhere, got %ld.\n",
+ high_req, low_req,
+ request - amount_wanted);
+
+ toi_cond_pause(0, NULL);
+
+ if (freeze_kernel_threads())
+ set_abort_result(TOI_FREEZING_FAILED);
+ }
+
+ if (did_eat_memory)
+ toi_recalculate_image_contents(0);
+}
+
+/* toi_prepare_image
+ *
+ * Entry point to the whole image preparation section.
+ *
+ * We do four things:
+ * - Freeze processes;
+ * - Ensure image size constraints are met;
+ * - Complete all the preparation for saving the image,
+ * including allocation of storage.
The only memory + * that should be needed when we're finished is that + * for actually storing the image (and we know how + * much is needed for that because the modules tell + * us). + * - Make sure that all dirty buffers are written out. + */ +#define MAX_TRIES 2 +int toi_prepare_image(void) +{ + int result = 1, tries = 1; + + main_storage_allocated = 0; + no_ps2_needed = 0; + + if (attempt_to_freeze()) + return 1; + + lock_device_hotplug(); + set_toi_state(TOI_DEVICE_HOTPLUG_LOCKED); + + if (!extra_pd1_pages_allowance) + get_extra_pd1_allowance(); + + storage_limit = toiActiveAllocator->storage_available(); + + if (!storage_limit) { + printk(KERN_INFO "No storage available. Didn't try to prepare " + "an image.\n"); + display_failure_reason(0); + set_abort_result(TOI_NOSTORAGE_AVAILABLE); + return 1; + } + + if (build_attention_list()) { + abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, + "Unable to successfully prepare the image.\n"); + return 1; + } + + toi_recalculate_image_contents(0); + + do { + toi_prepare_status(CLEAR_BAR, + "Preparing Image. Try %d.", tries); + + eat_memory(); + + if (test_result_state(TOI_ABORTED)) + break; + + update_image(0); + + tries++; + + } while (image_not_ready(1) && tries <= MAX_TRIES && + !test_result_state(TOI_ABORTED)); + + result = image_not_ready(0); + + /* TODO: Handle case where need to remove existing image and resave + * instead of adding to incremental image. */ + + if (!test_result_state(TOI_ABORTED)) { + if (result) { + display_stats(1, 0); + display_failure_reason(tries > MAX_TRIES); + abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, + "Unable to successfully prepare the image.\n"); + } else { + /* Pageset 2 needed? */ + if (!need_pageset2() && + test_action_state(TOI_NO_PS2_IF_UNNEEDED)) { + no_ps2_needed = 1; + toi_recalculate_image_contents(0); + update_image(1); + } + + toi_cond_pause(1, "Image preparation complete."); + } + } + + return result ? result : allocate_checksum_pages(); +} diff -Nur linux-4.2/kernel/power/tuxonice_prepare_image.h linux-4.2-pck/kernel/power/tuxonice_prepare_image.h --- linux-4.2/kernel/power/tuxonice_prepare_image.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_prepare_image.h 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,38 @@ +/* + * kernel/power/tuxonice_prepare_image.h + * + * Copyright (C) 2003-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. 
+ * + */ + +#include + +extern int toi_prepare_image(void); +extern void toi_recalculate_image_contents(int storage_available); +extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask); +extern long image_size_limit; +extern void toi_free_extra_pagedir_memory(void); +extern unsigned long extra_pd1_pages_allowance; +extern void free_attention_list(void); + +#define MIN_FREE_RAM 100 +#define MIN_EXTRA_PAGES_ALLOWANCE 500 + +#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1)) +#ifdef CONFIG_HIGHMEM +#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM)) +#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \ + (1 << ZONE_HIGHMEM))) +#else +#define real_nr_free_high_pages() (0) +#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask)) + +/* For eat_memory function */ +#define ZONE_HIGHMEM (MAX_NR_ZONES + 1) +#endif + +unsigned long get_header_storage_needed(void); +unsigned long any_to_free(int use_image_size_limit); +int try_allocate_extra_memory(void); diff -Nur linux-4.2/kernel/power/tuxonice_prune.c linux-4.2-pck/kernel/power/tuxonice_prune.c --- linux-4.2/kernel/power/tuxonice_prune.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_prune.c 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,406 @@ +/* + * kernel/power/tuxonice_prune.c + * + * Copyright (C) 2012 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * This file implements a TuxOnIce module that seeks to prune the + * amount of data written to disk. It builds a table of hashes + * of the uncompressed data, and writes the pfn of the previous page + * with the same contents instead of repeating the data when a match + * is found. + */ + +#include +#include +#include +#include +#include +#include + +#include "tuxonice_builtin.h" +#include "tuxonice.h" +#include "tuxonice_modules.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_io.h" +#include "tuxonice_ui.h" +#include "tuxonice_alloc.h" + +/* + * We never write a page bigger than PAGE_SIZE, so use a large number + * to indicate that data is a PFN. + */ +#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100) + +static unsigned long toi_pruned_pages; + +static struct toi_module_ops toi_prune_ops; +static struct toi_module_ops *next_driver; + +static char toi_prune_hash_algo_name[32] = "sha1"; + +static DEFINE_MUTEX(stats_lock); + +struct cpu_context { + struct shash_desc desc; + char *digest; +}; + +#define OUT_BUF_SIZE (2 * PAGE_SIZE) + +static DEFINE_PER_CPU(struct cpu_context, contexts); + +/* + * toi_crypto_prepare + * + * Prepare to do some work by allocating buffers and transforms. 
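+ * A hash transform and digest buffer are set up for each online CPU,
+ * using the algorithm named in toi_prune_hash_algo_name ("sha1" by
+ * default).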
+ */ +static int toi_prune_crypto_prepare(void) +{ + int cpu, ret, digestsize; + + if (!*toi_prune_hash_algo_name) { + printk(KERN_INFO "TuxOnIce: Pruning enabled but no " + "hash algorithm set.\n"); + return 1; + } + + for_each_online_cpu(cpu) { + struct cpu_context *this = &per_cpu(contexts, cpu); + this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0); + if (IS_ERR(this->desc.tfm)) { + printk(KERN_INFO "TuxOnIce: Failed to allocate the " + "%s prune hash algorithm.\n", + toi_prune_hash_algo_name); + this->desc.tfm = NULL; + return 1; + } + + if (!digestsize) + digestsize = crypto_shash_digestsize(this->desc.tfm); + + this->digest = kmalloc(digestsize, GFP_KERNEL); + if (!this->digest) { + printk(KERN_INFO "TuxOnIce: Failed to allocate space " + "for digest output.\n"); + crypto_free_shash(this->desc.tfm); + this->desc.tfm = NULL; + } + + this->desc.flags = 0; + + ret = crypto_shash_init(&this->desc); + if (ret < 0) { + printk(KERN_INFO "TuxOnIce: Failed to initialise the " + "%s prune hash algorithm.\n", + toi_prune_hash_algo_name); + kfree(this->digest); + this->digest = NULL; + crypto_free_shash(this->desc.tfm); + this->desc.tfm = NULL; + return 1; + } + } + + return 0; +} + +static int toi_prune_rw_cleanup(int writing) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct cpu_context *this = &per_cpu(contexts, cpu); + if (this->desc.tfm) { + crypto_free_shash(this->desc.tfm); + this->desc.tfm = NULL; + } + + if (this->digest) { + kfree(this->digest); + this->digest = NULL; + } + } + + return 0; +} + +/* + * toi_prune_init + */ + +static int toi_prune_init(int toi_or_resume) +{ + if (!toi_or_resume) + return 0; + + toi_pruned_pages = 0; + + next_driver = toi_get_next_filter(&toi_prune_ops); + + return next_driver ? 0 : -ECHILD; +} + +/* + * toi_prune_rw_init() + */ + +static int toi_prune_rw_init(int rw, int stream_number) +{ + if (toi_prune_crypto_prepare()) { + printk(KERN_ERR "Failed to initialise prune " + "algorithm.\n"); + if (rw == READ) { + printk(KERN_INFO "Unable to read the image.\n"); + return -ENODEV; + } else { + printk(KERN_INFO "Continuing without " + "pruning the image.\n"); + toi_prune_ops.enabled = 0; + } + } + + return 0; +} + +/* + * toi_prune_write_page() + * + * Compress a page of data, buffering output and passing on filled + * pages to the next module in the pipeline. + * + * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing + * data to be checked. + * + * Returns: 0 on success. Otherwise the error is that returned by later + * modules, -ECHILD if we have a broken pipeline or -EIO if + * zlib errs. 
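+ * Note that, as written, the page is hashed and the pruned-page counter
+ * incremented, but its data is still passed through unchanged to the
+ * next module in the pipeline.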
+ */ +static int toi_prune_write_page(unsigned long index, int buf_type, + void *buffer_page, unsigned int buf_size) +{ + int ret = 0, cpu = smp_processor_id(), write_data = 1; + struct cpu_context *ctx = &per_cpu(contexts, cpu); + u8* output_buffer = buffer_page; + int output_len = buf_size; + int out_buf_type = buf_type; + void *buffer_start; + u32 buf[4]; + + if (ctx->desc.tfm) { + + buffer_start = TOI_MAP(buf_type, buffer_page); + ctx->len = OUT_BUF_SIZE; + + ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, &ctx->digest); + if (ret) { + printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret); + } else { + mutex_lock(&stats_lock); + + toi_pruned_pages++; + + mutex_unlock(&stats_lock); + + } + + TOI_UNMAP(buf_type, buffer_page); + } + + if (write_data) + ret = next_driver->write_page(index, out_buf_type, + output_buffer, output_len); + else + ret = next_driver->write_page(index, out_buf_type, + output_buffer, output_len); + + return ret; +} + +/* + * toi_prune_read_page() + * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. + * + * Retrieve data from later modules or from a previously loaded page and + * fill the input buffer. + * Zero if successful. Error condition from me or from downstream on failure. + */ +static int toi_prune_read_page(unsigned long *index, int buf_type, + void *buffer_page, unsigned int *buf_size) +{ + int ret, cpu = smp_processor_id(); + unsigned int len; + char *buffer_start; + struct cpu_context *ctx = &per_cpu(contexts, cpu); + + if (!ctx->desc.tfm) + return next_driver->read_page(index, TOI_PAGE, buffer_page, + buf_size); + + /* + * All our reads must be synchronous - we can't handle + * data that hasn't been read yet. + */ + + ret = next_driver->read_page(index, buf_type, buffer_page, &len); + + if (len == PRUNE_DATA_IS_PFN) { + buffer_start = kmap(buffer_page); + } + + return ret; +} + +/* + * toi_prune_print_debug_stats + * @buffer: Pointer to a buffer into which the debug info will be printed. + * @size: Size of the buffer. + * + * Print information to be recorded for debugging purposes into a buffer. + * Returns: Number of characters written to the buffer. + */ + +static int toi_prune_print_debug_stats(char *buffer, int size) +{ + int len; + + /* Output the number of pages pruned. */ + if (*toi_prune_hash_algo_name) + len = scnprintf(buffer, size, "- Compressor is '%s'.\n", + toi_prune_hash_algo_name); + else + len = scnprintf(buffer, size, "- Compressor is not set.\n"); + + if (toi_pruned_pages) + len += scnprintf(buffer+len, size - len, " Pruned " + "%lu pages).\n", + toi_pruned_pages); + return len; +} + +/* + * toi_prune_memory_needed + * + * Tell the caller how much memory we need to operate during hibernate/resume. + * Returns: Unsigned long. Maximum number of bytes of memory required for + * operation. + */ +static int toi_prune_memory_needed(void) +{ + return 2 * PAGE_SIZE; +} + +static int toi_prune_storage_needed(void) +{ + return 2 * sizeof(unsigned long) + 2 * sizeof(int) + + strlen(toi_prune_hash_algo_name) + 1; +} + +/* + * toi_prune_save_config_info + * @buffer: Pointer to a buffer of size PAGE_SIZE. + * + * Save informaton needed when reloading the image at resume time. + * Returns: Number of bytes used for saving our data. 
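+ * Layout: the pruned-page count (unsigned long), then the length of
+ * the algorithm name (int), then the NUL-terminated name itself.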
+ */ +static int toi_prune_save_config_info(char *buffer) +{ + int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0; + + *((unsigned long *) buffer) = toi_pruned_pages; + offset += sizeof(unsigned long); + *((int *) (buffer + offset)) = len; + offset += sizeof(int); + strncpy(buffer + offset, toi_prune_hash_algo_name, len); + return offset + len; +} + +/* toi_prune_load_config_info + * @buffer: Pointer to the start of the data. + * @size: Number of bytes that were saved. + * + * Description: Reload information needed for passing back to the + * resumed kernel. + */ +static void toi_prune_load_config_info(char *buffer, int size) +{ + int len, offset = 0; + + toi_pruned_pages = *((unsigned long *) buffer); + offset += sizeof(unsigned long); + len = *((int *) (buffer + offset)); + offset += sizeof(int); + strncpy(toi_prune_hash_algo_name, buffer + offset, len); +} + +static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd) +{ + bkd->pruned_pages = toi_pruned_pages; +} + +static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd) +{ + toi_pruned_pages = bkd->pruned_pages; +} + +/* + * toi_expected_ratio + * + * Description: Returns the expected ratio between data passed into this module + * and the amount of data output when writing. + * Returns: 100 - we have no idea how many pages will be pruned. + */ + +static int toi_prune_expected_ratio(void) +{ + return 100; +} + +/* + * data for our sysfs entries. + */ +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0, + NULL), + SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL), +}; + +/* + * Ops structure. + */ +static struct toi_module_ops toi_prune_ops = { + .type = FILTER_MODULE, + .name = "prune", + .directory = "prune", + .module = THIS_MODULE, + .initialise = toi_prune_init, + .memory_needed = toi_prune_memory_needed, + .print_debug_info = toi_prune_print_debug_stats, + .save_config_info = toi_prune_save_config_info, + .load_config_info = toi_prune_load_config_info, + .storage_needed = toi_prune_storage_needed, + .expected_compression = toi_prune_expected_ratio, + + .pre_atomic_restore = toi_prune_pre_atomic_restore, + .post_atomic_restore = toi_prune_post_atomic_restore, + + .rw_init = toi_prune_rw_init, + .rw_cleanup = toi_prune_rw_cleanup, + + .write_page = toi_prune_write_page, + .read_page = toi_prune_read_page, + + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +/* ---- Registration ---- */ + +static __init int toi_prune_load(void) +{ + return toi_register_module(&toi_prune_ops); +} + +late_initcall(toi_prune_load); diff -Nur linux-4.2/kernel/power/tuxonice_storage.c linux-4.2-pck/kernel/power/tuxonice_storage.c --- linux-4.2/kernel/power/tuxonice_storage.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_storage.c 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,282 @@ +/* + * kernel/power/tuxonice_storage.c + * + * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * Routines for talking to a userspace program that manages storage. 
+ * + * The kernel side: + * - starts the userspace program; + * - sends messages telling it when to open and close the connection; + * - tells it when to quit; + * + * The user space side: + * - passes messages regarding status; + * + */ + +#include +#include + +#include "tuxonice_sysfs.h" +#include "tuxonice_modules.h" +#include "tuxonice_netlink.h" +#include "tuxonice_storage.h" +#include "tuxonice_ui.h" + +static struct user_helper_data usm_helper_data; +static struct toi_module_ops usm_ops; +static int message_received, usm_prepare_count; +static int storage_manager_last_action, storage_manager_action; + +static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int type; + int *data; + + type = nlh->nlmsg_type; + + /* A control message: ignore them */ + if (type < NETLINK_MSG_BASE) + return 0; + + /* Unknown message: reply with EINVAL */ + if (type >= USM_MSG_MAX) + return -EINVAL; + + /* All operations require privileges, even GET */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* Only allow one task to receive NOFREEZE privileges */ + if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1) + return -EBUSY; + + data = (int *) NLMSG_DATA(nlh); + + switch (type) { + case USM_MSG_SUCCESS: + case USM_MSG_FAILED: + message_received = type; + complete(&usm_helper_data.wait_for_process); + break; + default: + printk(KERN_INFO "Storage manager doesn't recognise " + "message %d.\n", type); + } + + return 1; +} + +#ifdef CONFIG_NET +static int activations; + +int toi_activate_storage(int force) +{ + int tries = 1; + + if (usm_helper_data.pid == -1 || !usm_ops.enabled) + return 0; + + message_received = 0; + activations++; + + if (activations > 1 && !force) + return 0; + + while ((!message_received || message_received == USM_MSG_FAILED) && + tries < 2) { + toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt " + "%d.\n", tries); + + init_completion(&usm_helper_data.wait_for_process); + + toi_send_netlink_message(&usm_helper_data, + USM_MSG_CONNECT, + NULL, 0); + + /* Wait 2 seconds for the userspace process to make contact */ + wait_for_completion_timeout(&usm_helper_data.wait_for_process, + 2*HZ); + + tries++; + } + + return 0; +} + +int toi_deactivate_storage(int force) +{ + if (usm_helper_data.pid == -1 || !usm_ops.enabled) + return 0; + + message_received = 0; + activations--; + + if (activations && !force) + return 0; + + init_completion(&usm_helper_data.wait_for_process); + + toi_send_netlink_message(&usm_helper_data, + USM_MSG_DISCONNECT, + NULL, 0); + + wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); + + if (!message_received || message_received == USM_MSG_FAILED) { + printk(KERN_INFO "Returning failure disconnecting storage.\n"); + return 1; + } + + return 0; +} +#endif + +static void storage_manager_simulate(void) +{ + printk(KERN_INFO "--- Storage manager simulate ---\n"); + toi_prepare_usm(); + schedule(); + printk(KERN_INFO "--- Activate storage 1 ---\n"); + toi_activate_storage(1); + schedule(); + printk(KERN_INFO "--- Deactivate storage 1 ---\n"); + toi_deactivate_storage(1); + schedule(); + printk(KERN_INFO "--- Cleanup usm ---\n"); + toi_cleanup_usm(); + schedule(); + printk(KERN_INFO "--- Storage manager simulate ends ---\n"); +} + +static int usm_storage_needed(void) +{ + return sizeof(int) + strlen(usm_helper_data.program) + 1; +} + +static int usm_save_config_info(char *buf) +{ + int len = strlen(usm_helper_data.program); + memcpy(buf, usm_helper_data.program, len + 1); + return sizeof(int) + len + 1; +} + 
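+/*
+ * usm_load_config_info expects the buffer to begin with an int giving the
+ * length of the program path, followed by the path itself, i.e.
+ *
+ *   | int len | program path |
+ *
+ * matching the space reserved by usm_storage_needed().
+ */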
+static void usm_load_config_info(char *buf, int size) +{ + /* Don't load the saved path if one has already been set */ + if (usm_helper_data.program[0]) + return; + + memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf)); +} + +static int usm_memory_needed(void) +{ + /* ball park figure of 32 pages */ + return 32 * PAGE_SIZE; +} + +/* toi_prepare_usm + */ +int toi_prepare_usm(void) +{ + usm_prepare_count++; + + if (usm_prepare_count > 1 || !usm_ops.enabled) + return 0; + + usm_helper_data.pid = -1; + + if (!*usm_helper_data.program) + return 0; + + toi_netlink_setup(&usm_helper_data); + + if (usm_helper_data.pid == -1) + printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't" + " start it.\n"); + + toi_activate_storage(0); + + return usm_helper_data.pid != -1; +} + +void toi_cleanup_usm(void) +{ + usm_prepare_count--; + + if (usm_helper_data.pid > -1 && !usm_prepare_count) { + toi_deactivate_storage(0); + toi_netlink_close(&usm_helper_data); + } +} + +static void storage_manager_activate(void) +{ + if (storage_manager_action == storage_manager_last_action) + return; + + if (storage_manager_action) + toi_prepare_usm(); + else + toi_cleanup_usm(); + + storage_manager_last_action = storage_manager_action; +} + +/* + * User interface specific /sys/power/tuxonice entries. + */ + +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate), + SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL), + SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0, + NULL), + SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1, + 0, storage_manager_activate) +}; + +static struct toi_module_ops usm_ops = { + .type = MISC_MODULE, + .name = "usm", + .directory = "storage_manager", + .module = THIS_MODULE, + .storage_needed = usm_storage_needed, + .save_config_info = usm_save_config_info, + .load_config_info = usm_load_config_info, + .memory_needed = usm_memory_needed, + + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +/* toi_usm_sysfs_init + * Description: Boot time initialisation for user interface. + */ +int toi_usm_init(void) +{ + usm_helper_data.nl = NULL; + usm_helper_data.program[0] = '\0'; + usm_helper_data.pid = -1; + usm_helper_data.skb_size = 0; + usm_helper_data.pool_limit = 6; + usm_helper_data.netlink_id = NETLINK_TOI_USM; + usm_helper_data.name = "userspace storage manager"; + usm_helper_data.rcv_msg = usm_user_rcv_msg; + usm_helper_data.interface_version = 2; + usm_helper_data.must_init = 0; + init_completion(&usm_helper_data.wait_for_process); + + return toi_register_module(&usm_ops); +} + +void toi_usm_exit(void) +{ + toi_netlink_close_complete(&usm_helper_data); + toi_unregister_module(&usm_ops); +} diff -Nur linux-4.2/kernel/power/tuxonice_storage.h linux-4.2-pck/kernel/power/tuxonice_storage.h --- linux-4.2/kernel/power/tuxonice_storage.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_storage.h 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,45 @@ +/* + * kernel/power/tuxonice_storage.h + * + * Copyright (C) 2005-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. 
+ */ + +#ifdef CONFIG_NET +int toi_prepare_usm(void); +void toi_cleanup_usm(void); + +int toi_activate_storage(int force); +int toi_deactivate_storage(int force); +extern int toi_usm_init(void); +extern void toi_usm_exit(void); +#else +static inline int toi_usm_init(void) { return 0; } +static inline void toi_usm_exit(void) { } + +static inline int toi_activate_storage(int force) +{ + return 0; +} + +static inline int toi_deactivate_storage(int force) +{ + return 0; +} + +static inline int toi_prepare_usm(void) { return 0; } +static inline void toi_cleanup_usm(void) { } +#endif + +enum { + USM_MSG_BASE = 0x10, + + /* Kernel -> Userspace */ + USM_MSG_CONNECT = 0x30, + USM_MSG_DISCONNECT = 0x31, + USM_MSG_SUCCESS = 0x40, + USM_MSG_FAILED = 0x41, + + USM_MSG_MAX, +}; diff -Nur linux-4.2/kernel/power/tuxonice_swap.c linux-4.2-pck/kernel/power/tuxonice_swap.c --- linux-4.2/kernel/power/tuxonice_swap.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_swap.c 2015-09-08 11:20:32.470345146 -0300 @@ -0,0 +1,474 @@ +/* + * kernel/power/tuxonice_swap.c + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * Distributed under GPLv2. + * + * This file encapsulates functions for usage of swap space as a + * backing store. + */ + +#include +#include +#include +#include +#include +#include + +#include "tuxonice.h" +#include "tuxonice_sysfs.h" +#include "tuxonice_modules.h" +#include "tuxonice_io.h" +#include "tuxonice_ui.h" +#include "tuxonice_extent.h" +#include "tuxonice_bio.h" +#include "tuxonice_alloc.h" +#include "tuxonice_builtin.h" + +static struct toi_module_ops toi_swapops; + +/* For swapfile automatically swapon/off'd. */ +static char swapfilename[255] = ""; +static int toi_swapon_status; + +/* Swap Pages */ +static unsigned long swap_allocated; + +static struct sysinfo swapinfo; + +static int is_ram_backed(struct swap_info_struct *si) +{ + if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) || + !strncmp(si->bdev->bd_disk->disk_name, "zram", 4)) + return 1; + + return 0; +} + +/** + * enable_swapfile: Swapon the user specified swapfile prior to hibernating. + * + * Activate the given swapfile if it wasn't already enabled. Remember whether + * we really did swapon it for swapoffing later. + */ +static void enable_swapfile(void) +{ + int activateswapresult = -EINVAL; + + if (swapfilename[0]) { + /* Attempt to swap on with maximum priority */ + activateswapresult = sys_swapon(swapfilename, 0xFFFF); + if (activateswapresult && activateswapresult != -EBUSY) + printk(KERN_ERR "TuxOnIce: The swapfile/partition " + "specified by /sys/power/tuxonice/swap/swapfile" + " (%s) could not be turned on (error %d). " + "Attempting to continue.\n", + swapfilename, activateswapresult); + if (!activateswapresult) + toi_swapon_status = 1; + } +} + +/** + * disable_swapfile: Swapoff any file swaponed at the start of the cycle. + * + * If we did successfully swapon a file at the start of the cycle, swapoff + * it now (finishing up). 
+ */ +static void disable_swapfile(void) +{ + if (!toi_swapon_status) + return; + + sys_swapoff(swapfilename); + toi_swapon_status = 0; +} + +static int add_blocks_to_extent_chain(struct toi_bdev_info *chain, + unsigned long start, unsigned long end) +{ + if (test_action_state(TOI_TEST_BIO)) + toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to " + "chain %p.", start << chain->bmap_shift, + end << chain->bmap_shift, chain); + + return toi_add_to_extent_chain(&chain->blocks, start, end); +} + + +static int get_main_pool_phys_params(struct toi_bdev_info *chain) +{ + struct hibernate_extent *extentpointer = NULL; + unsigned long address, extent_min = 0, extent_max = 0; + int empty = 1; + + toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for " + "chain %d.", chain->allocator_index); + + if (!chain->allocations.first) + return 0; + + if (chain->blocks.first) + toi_put_extent_chain(&chain->blocks); + + toi_extent_for_each(&chain->allocations, extentpointer, address) { + swp_entry_t swap_address = (swp_entry_t) { address }; + struct block_device *bdev; + sector_t new_sector = map_swap_entry(swap_address, &bdev); + + if (empty) { + empty = 0; + extent_min = extent_max = new_sector; + continue; + } + + if (new_sector == extent_max + 1) { + extent_max++; + continue; + } + + if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) { + printk(KERN_ERR "Out of memory while making block " + "chains.\n"); + return -ENOMEM; + } + + extent_min = new_sector; + extent_max = new_sector; + } + + if (!empty && + add_blocks_to_extent_chain(chain, extent_min, extent_max)) { + printk(KERN_ERR "Out of memory while making block chains.\n"); + return -ENOMEM; + } + + return 0; +} + +/* + * Like si_swapinfo, except that we don't include ram backed swap (compcache!) + * and don't need to use the spinlocks (userspace is stopped when this + * function is called). + */ +void si_swapinfo_no_compcache(void) +{ + unsigned int i; + + si_swapinfo(&swapinfo); + swapinfo.freeswap = 0; + swapinfo.totalswap = 0; + + for (i = 0; i < MAX_SWAPFILES; i++) { + struct swap_info_struct *si = get_swap_info_struct(i); + if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) { + swapinfo.totalswap += si->inuse_pages; + swapinfo.freeswap += si->pages - si->inuse_pages; + } + } +} +/* + * We can't just remember the value from allocation time, because other + * processes might have allocated swap in the mean time. 
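+ * Instead, toi_swap_storage_available() re-reads the swap totals and
+ * adds back the pages this cycle has already allocated (swap_allocated).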
+ */ +static unsigned long toi_swap_storage_available(void) +{ + toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available."); + si_swapinfo_no_compcache(); + return swapinfo.freeswap + swap_allocated; +} + +static int toi_swap_initialise(int starting_cycle) +{ + if (!starting_cycle) + return 0; + + enable_swapfile(); + return 0; +} + +static void toi_swap_cleanup(int ending_cycle) +{ + if (!ending_cycle) + return; + + disable_swapfile(); +} + +static void toi_swap_free_storage(struct toi_bdev_info *chain) +{ + /* Free swap entries */ + struct hibernate_extent *extentpointer; + unsigned long extentvalue; + + toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.", + chain); + + swap_allocated -= chain->allocations.size; + toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) + swap_free((swp_entry_t) { extentvalue }); + + toi_put_extent_chain(&chain->allocations); +} + +static void free_swap_range(unsigned long min, unsigned long max) +{ + int j; + + for (j = min; j <= max; j++) + swap_free((swp_entry_t) { j }); + swap_allocated -= (max - min + 1); +} + +/* + * Allocation of a single swap type. Swap priorities are handled at the higher + * level. + */ +static int toi_swap_allocate_storage(struct toi_bdev_info *chain, + unsigned long request) +{ + unsigned long gotten = 0; + + toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to" + " allocate %lu pages from device %d.", request, + chain->allocator_index); + + while (gotten < request) { + swp_entry_t start, end; + if (0) { + /* Broken at the moment for SSDs */ + get_swap_range_of_type(chain->allocator_index, &start, &end, + request - gotten + 1); + } else { + start = end = get_swap_page_of_type(chain->allocator_index); + } + if (start.val) { + int added = end.val - start.val + 1; + if (toi_add_to_extent_chain(&chain->allocations, + start.val, end.val)) { + printk(KERN_INFO "Failed to allocate extent for " + "%lu-%lu.\n", start.val, end.val); + free_swap_range(start.val, end.val); + break; + } + gotten += added; + swap_allocated += added; + } else + break; + } + + toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten); + return gotten; +} + +static int toi_swap_register_storage(void) +{ + int i, result = 0; + + toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage."); + for (i = 0; i < MAX_SWAPFILES; i++) { + struct swap_info_struct *si = get_swap_info_struct(i); + struct toi_bdev_info *devinfo; + unsigned char *p; + unsigned char buf[256]; + struct fs_info *fs_info; + + if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si)) + continue; + + devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), + GFP_ATOMIC); + if (!devinfo) { + printk("Failed to allocate devinfo struct for swap " + "device %d.\n", i); + return -ENOMEM; + } + + devinfo->bdev = si->bdev; + devinfo->allocator = &toi_swapops; + devinfo->allocator_index = i; + + fs_info = fs_info_from_block_dev(si->bdev); + if (fs_info && !IS_ERR(fs_info)) { + memcpy(devinfo->uuid, &fs_info->uuid, 16); + free_fs_info(fs_info); + } else + result = (int) PTR_ERR(fs_info); + + if (!fs_info) + printk("fs_info from block dev returned %d.\n", result); + devinfo->dev_t = si->bdev->bd_dev; + devinfo->prio = si->prio; + devinfo->bmap_shift = 3; + devinfo->blocks_per_page = 1; + + p = d_path(&si->swap_file->f_path, buf, sizeof(buf)); + sprintf(devinfo->name, "swap on %s", p); + + toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:" + " Device %d (%lx), prio %d.", i, + (unsigned long) devinfo->dev_t, devinfo->prio); 
+ toi_bio_ops.register_storage(devinfo); + } + + return 0; +} + +static unsigned long toi_swap_free_unused_storage(struct toi_bdev_info *chain, unsigned long used) +{ + struct hibernate_extent *extentpointer = NULL; + unsigned long extentvalue; + unsigned long i = 0, first_freed = 0; + + toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) { + i++; + if (i > used) { + swap_free((swp_entry_t) { extentvalue }); + if (!first_freed) + first_freed = extentvalue; + } + } + + return first_freed; +} + +/* + * workspace_size + * + * Description: + * Returns the number of bytes of RAM needed for this + * code to do its work. (Used when calculating whether + * we have enough memory to be able to hibernate & resume). + * + */ +static int toi_swap_memory_needed(void) +{ + return 1; +} + +/* + * Print debug info + * + * Description: + */ +static int toi_swap_print_debug_stats(char *buffer, int size) +{ + int len = 0; + + len = scnprintf(buffer, size, "- Swap Allocator enabled.\n"); + if (swapfilename[0]) + len += scnprintf(buffer+len, size-len, + " Attempting to automatically swapon: %s.\n", + swapfilename); + + si_swapinfo_no_compcache(); + + len += scnprintf(buffer+len, size-len, + " Swap available for image: %lu pages.\n", + swapinfo.freeswap + swap_allocated); + + return len; +} + +static int header_locations_read_sysfs(const char *page, int count) +{ + int i, printedpartitionsmessage = 0, len = 0, haveswap = 0; + struct inode *swapf = NULL; + int zone; + char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL); + char *path, *output = (char *) page; + int path_len; + + if (!page) + return 0; + + for (i = 0; i < MAX_SWAPFILES; i++) { + struct swap_info_struct *si = get_swap_info_struct(i); + + if (!si || !(si->flags & SWP_WRITEOK)) + continue; + + if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) { + haveswap = 1; + if (!printedpartitionsmessage) { + len += sprintf(output + len, + "For swap partitions, simply use the " + "format: resume=swap:/dev/hda1.\n"); + printedpartitionsmessage = 1; + } + } else { + path_len = 0; + + path = d_path(&si->swap_file->f_path, path_page, + PAGE_SIZE); + path_len = snprintf(path_page, PAGE_SIZE, "%s", path); + + haveswap = 1; + swapf = si->swap_file->f_mapping->host; + zone = bmap(swapf, 0); + if (!zone) { + len += sprintf(output + len, + "Swapfile %s has been corrupted. 
Reuse" + " mkswap on it and try again.\n", + path_page); + } else { + char name_buffer[BDEVNAME_SIZE]; + len += sprintf(output + len, + "For swapfile `%s`," + " use resume=swap:/dev/%s:0x%x.\n", + path_page, + bdevname(si->bdev, name_buffer), + zone << (swapf->i_blkbits - 9)); + } + } + } + + if (!haveswap) + len = sprintf(output, "You need to turn on swap partitions " + "before examining this file.\n"); + + toi_free_page(10, (unsigned long) path_page); + return len; +} + +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL), + SYSFS_CUSTOM("headerlocations", SYSFS_READONLY, + header_locations_read_sysfs, NULL, 0, NULL), + SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0, + attempt_to_parse_resume_device2), +}; + +static struct toi_bio_allocator_ops toi_bio_swapops = { + .register_storage = toi_swap_register_storage, + .storage_available = toi_swap_storage_available, + .allocate_storage = toi_swap_allocate_storage, + .bmap = get_main_pool_phys_params, + .free_storage = toi_swap_free_storage, + .free_unused_storage = toi_swap_free_unused_storage, +}; + +static struct toi_module_ops toi_swapops = { + .type = BIO_ALLOCATOR_MODULE, + .name = "swap storage", + .directory = "swap", + .module = THIS_MODULE, + .memory_needed = toi_swap_memory_needed, + .print_debug_info = toi_swap_print_debug_stats, + .initialise = toi_swap_initialise, + .cleanup = toi_swap_cleanup, + .bio_allocator_ops = &toi_bio_swapops, + + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +/* ---- Registration ---- */ +static __init int toi_swap_load(void) +{ + return toi_register_module(&toi_swapops); +} + +late_initcall(toi_swap_load); diff -Nur linux-4.2/kernel/power/tuxonice_sysfs.c linux-4.2-pck/kernel/power/tuxonice_sysfs.c --- linux-4.2/kernel/power/tuxonice_sysfs.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_sysfs.c 2015-09-08 11:20:32.473678317 -0300 @@ -0,0 +1,333 @@ +/* + * kernel/power/tuxonice_sysfs.c + * + * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * This file contains support for sysfs entries for tuning TuxOnIce. + * + * We have a generic handler that deals with the most common cases, and + * hooks for special handlers to use. + */ + +#include + +#include "tuxonice_sysfs.h" +#include "tuxonice.h" +#include "tuxonice_storage.h" +#include "tuxonice_alloc.h" + +static int toi_sysfs_initialised; + +static void toi_initialise_sysfs(void); + +static struct toi_sysfs_data sysfs_params[]; + +#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr) + +static void toi_main_wrapper(void) +{ + toi_try_hibernate(); +} + +static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); + int len = 0; + int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ; + + if (full_prep && toi_start_anything(0)) + return -EBUSY; + + if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) + toi_prepare_usm(); + + switch (sysfs_data->type) { + case TOI_SYSFS_DATA_CUSTOM: + len = (sysfs_data->data.special.read_sysfs) ? 
+ (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE) + : 0; + break; + case TOI_SYSFS_DATA_BIT: + len = sprintf(page, "%d\n", + -test_bit(sysfs_data->data.bit.bit, + sysfs_data->data.bit.bit_vector)); + break; + case TOI_SYSFS_DATA_INTEGER: + len = sprintf(page, "%d\n", + *(sysfs_data->data.integer.variable)); + break; + case TOI_SYSFS_DATA_LONG: + len = sprintf(page, "%ld\n", + *(sysfs_data->data.a_long.variable)); + break; + case TOI_SYSFS_DATA_UL: + len = sprintf(page, "%lu\n", + *(sysfs_data->data.ul.variable)); + break; + case TOI_SYSFS_DATA_STRING: + len = sprintf(page, "%s\n", + sysfs_data->data.string.variable); + break; + } + + if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) + toi_cleanup_usm(); + + if (full_prep) + toi_finish_anything(0); + + return len; +} + +#define BOUND(_variable, _type) do { \ + if (*_variable < sysfs_data->data._type.minimum) \ + *_variable = sysfs_data->data._type.minimum; \ + else if (*_variable > sysfs_data->data._type.maximum) \ + *_variable = sysfs_data->data._type.maximum; \ +} while (0) + +static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr, + const char *my_buf, size_t count) +{ + int assigned_temp_buffer = 0, result = count; + struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); + + if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME))) + return -EBUSY; + + ((char *) my_buf)[count] = 0; + + if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) + toi_prepare_usm(); + + switch (sysfs_data->type) { + case TOI_SYSFS_DATA_CUSTOM: + if (sysfs_data->data.special.write_sysfs) + result = (sysfs_data->data.special.write_sysfs)(my_buf, + count); + break; + case TOI_SYSFS_DATA_BIT: + { + unsigned long value; + result = kstrtoul(my_buf, 0, &value); + if (result) + break; + if (value) + set_bit(sysfs_data->data.bit.bit, + (sysfs_data->data.bit.bit_vector)); + else + clear_bit(sysfs_data->data.bit.bit, + (sysfs_data->data.bit.bit_vector)); + } + break; + case TOI_SYSFS_DATA_INTEGER: + { + long temp; + result = kstrtol(my_buf, 0, &temp); + if (result) + break; + *(sysfs_data->data.integer.variable) = (int) temp; + BOUND(sysfs_data->data.integer.variable, integer); + break; + } + case TOI_SYSFS_DATA_LONG: + { + long *variable = + sysfs_data->data.a_long.variable; + result = kstrtol(my_buf, 0, variable); + if (result) + break; + BOUND(variable, a_long); + break; + } + case TOI_SYSFS_DATA_UL: + { + unsigned long *variable = + sysfs_data->data.ul.variable; + result = kstrtoul(my_buf, 0, variable); + if (result) + break; + BOUND(variable, ul); + break; + } + break; + case TOI_SYSFS_DATA_STRING: + { + int copy_len = count; + char *variable = + sysfs_data->data.string.variable; + + if (sysfs_data->data.string.max_length && + (copy_len > sysfs_data->data.string.max_length)) + copy_len = sysfs_data->data.string.max_length; + + if (!variable) { + variable = (char *) toi_get_zeroed_page(31, + TOI_ATOMIC_GFP); + sysfs_data->data.string.variable = variable; + assigned_temp_buffer = 1; + } + strncpy(variable, my_buf, copy_len); + if (copy_len && my_buf[copy_len - 1] == '\n') + variable[count - 1] = 0; + variable[count] = 0; + } + break; + } + + if (!result) + result = count; + + /* Side effect routine? 
*/ + if (result == count && sysfs_data->write_side_effect) + sysfs_data->write_side_effect(); + + /* Free temporary buffers */ + if (assigned_temp_buffer) { + toi_free_page(31, + (unsigned long) sysfs_data->data.string.variable); + sysfs_data->data.string.variable = NULL; + } + + if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) + toi_cleanup_usm(); + + toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME); + + return result; +} + +static struct sysfs_ops toi_sysfs_ops = { + .show = &toi_attr_show, + .store = &toi_attr_store, +}; + +static struct kobj_type toi_ktype = { + .sysfs_ops = &toi_sysfs_ops, +}; + +struct kobject *tuxonice_kobj; + +/* Non-module sysfs entries. + * + * This array contains entries that are automatically registered at + * boot. Modules and the console code register their own entries separately. + */ + +static struct toi_sysfs_data sysfs_params[] = { + SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL, + SYSFS_HIBERNATING, toi_main_wrapper), + SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL, + SYSFS_RESUMING, toi_try_resume) +}; + +void remove_toi_sysdir(struct kobject *kobj) +{ + if (!kobj) + return; + + kobject_put(kobj); +} + +struct kobject *make_toi_sysdir(char *name) +{ + struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj); + + if (!kobj) { + printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs " + "dir!\n"); + return NULL; + } + + kobj->ktype = &toi_ktype; + + return kobj; +} + +/* toi_register_sysfs_file + * + * Helper for registering a new /sysfs/tuxonice entry. + */ + +int toi_register_sysfs_file( + struct kobject *kobj, + struct toi_sysfs_data *toi_sysfs_data) +{ + int result; + + if (!toi_sysfs_initialised) + toi_initialise_sysfs(); + + result = sysfs_create_file(kobj, &toi_sysfs_data->attr); + if (result) + printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s " + "returned %d.\n", + toi_sysfs_data->attr.name, result); + kobj->ktype = &toi_ktype; + + return result; +} + +/* toi_unregister_sysfs_file + * + * Helper for removing unwanted /sys/power/tuxonice entries. + * + */ +void toi_unregister_sysfs_file(struct kobject *kobj, + struct toi_sysfs_data *toi_sysfs_data) +{ + sysfs_remove_file(kobj, &toi_sysfs_data->attr); +} + +void toi_cleanup_sysfs(void) +{ + int i, + numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); + + if (!toi_sysfs_initialised) + return; + + for (i = 0; i < numfiles; i++) + toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); + + kobject_put(tuxonice_kobj); + toi_sysfs_initialised = 0; +} + +/* toi_initialise_sysfs + * + * Initialise the /sysfs/tuxonice directory. 
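+ *
+ * Illustrative usage sketch (not part of the original patch): entries are
+ * declared with the SYSFS_* helpers from tuxonice_sysfs.h and attached with
+ * toi_register_sysfs_file(). A hypothetical module-private entry could look
+ * like this, where "example" and example_var are made-up names:
+ *
+ *	static int example_var;
+ *	static struct toi_sysfs_data example_entry =
+ *		SYSFS_INT("example", SYSFS_RW, &example_var, 0, 10, 0, NULL);
+ *
+ *	toi_register_sysfs_file(tuxonice_kobj, &example_entry);
+ *
+ * The sysfs_params[] arrays in this file and elsewhere in this patch follow
+ * the same pattern.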
+ */ + +static void toi_initialise_sysfs(void) +{ + int i; + int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); + + if (toi_sysfs_initialised) + return; + + /* Make our TuxOnIce directory a child of /sys/power */ + tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj); + if (!tuxonice_kobj) + return; + + toi_sysfs_initialised = 1; + + for (i = 0; i < numfiles; i++) + toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); +} + +int toi_sysfs_init(void) +{ + toi_initialise_sysfs(); + return 0; +} + +void toi_sysfs_exit(void) +{ + toi_cleanup_sysfs(); +} diff -Nur linux-4.2/kernel/power/tuxonice_sysfs.h linux-4.2-pck/kernel/power/tuxonice_sysfs.h --- linux-4.2/kernel/power/tuxonice_sysfs.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_sysfs.h 2015-09-08 11:20:32.473678317 -0300 @@ -0,0 +1,137 @@ +/* + * kernel/power/tuxonice_sysfs.h + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + */ + +#include + +struct toi_sysfs_data { + struct attribute attr; + int type; + int flags; + union { + struct { + unsigned long *bit_vector; + int bit; + } bit; + struct { + int *variable; + int minimum; + int maximum; + } integer; + struct { + long *variable; + long minimum; + long maximum; + } a_long; + struct { + unsigned long *variable; + unsigned long minimum; + unsigned long maximum; + } ul; + struct { + char *variable; + int max_length; + } string; + struct { + int (*read_sysfs) (const char *buffer, int count); + int (*write_sysfs) (const char *buffer, int count); + void *data; + } special; + } data; + + /* Side effects routine. Used, eg, for reparsing the + * resume= entry when it changes */ + void (*write_side_effect) (void); + struct list_head sysfs_data_list; +}; + +enum { + TOI_SYSFS_DATA_NONE = 1, + TOI_SYSFS_DATA_CUSTOM, + TOI_SYSFS_DATA_BIT, + TOI_SYSFS_DATA_INTEGER, + TOI_SYSFS_DATA_UL, + TOI_SYSFS_DATA_LONG, + TOI_SYSFS_DATA_STRING +}; + +#define SYSFS_WRITEONLY 0200 +#define SYSFS_READONLY 0444 +#define SYSFS_RW 0644 + +#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \ + .attr = {.name = _name , .mode = _mode }, \ + .type = TOI_SYSFS_DATA_BIT, \ + .flags = _flags, \ + .data = { .bit = { .bit_vector = _ul, .bit = _bit } } } + +#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \ + .attr = {.name = _name , .mode = _mode }, \ + .type = TOI_SYSFS_DATA_INTEGER, \ + .flags = _flags, \ + .data = { .integer = { .variable = _int, .minimum = _min, \ + .maximum = _max } }, \ + .write_side_effect = _wse } + +#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \ + .attr = {.name = _name , .mode = _mode }, \ + .type = TOI_SYSFS_DATA_UL, \ + .flags = _flags, \ + .data = { .ul = { .variable = _ul, .minimum = _min, \ + .maximum = _max } } } + +#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \ + .attr = {.name = _name , .mode = _mode }, \ + .type = TOI_SYSFS_DATA_LONG, \ + .flags = _flags, \ + .data = { .a_long = { .variable = _long, .minimum = _min, \ + .maximum = _max } } } + +#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \ + .attr = {.name = _name , .mode = _mode }, \ + .type = TOI_SYSFS_DATA_STRING, \ + .flags = _flags, \ + .data = { .string = { .variable = _string, .max_length = _max_len } }, \ + .write_side_effect = _wse } + +#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \ + .attr = {.name = _name , .mode = _mode }, \ + .type = TOI_SYSFS_DATA_CUSTOM, \ + .flags = _flags, \ + .data = { 
.special = { .read_sysfs = _read, .write_sysfs = _write } }, \ + .write_side_effect = _wse } + +#define SYSFS_NONE(_name, _wse) { \ + .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \ + .type = TOI_SYSFS_DATA_NONE, \ + .write_side_effect = _wse, \ +} + +/* Flags */ +#define SYSFS_NEEDS_SM_FOR_READ 1 +#define SYSFS_NEEDS_SM_FOR_WRITE 2 +#define SYSFS_HIBERNATE 4 +#define SYSFS_RESUME 8 +#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME) +#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE) +#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE) +#define SYSFS_NEEDS_SM_FOR_BOTH \ + (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE) + +int toi_register_sysfs_file(struct kobject *kobj, + struct toi_sysfs_data *toi_sysfs_data); +void toi_unregister_sysfs_file(struct kobject *kobj, + struct toi_sysfs_data *toi_sysfs_data); + +extern struct kobject *tuxonice_kobj; + +struct kobject *make_toi_sysdir(char *name); +void remove_toi_sysdir(struct kobject *obj); +extern void toi_cleanup_sysfs(void); + +extern int toi_sysfs_init(void); +extern void toi_sysfs_exit(void); diff -Nur linux-4.2/kernel/power/tuxonice_ui.c linux-4.2-pck/kernel/power/tuxonice_ui.c --- linux-4.2/kernel/power/tuxonice_ui.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_ui.c 2015-09-08 11:20:32.473678317 -0300 @@ -0,0 +1,247 @@ +/* + * kernel/power/tuxonice_ui.c + * + * Copyright (C) 1998-2001 Gabor Kuti + * Copyright (C) 1998,2001,2002 Pavel Machek + * Copyright (C) 2002-2003 Florent Chabaud + * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * Routines for TuxOnIce's user interface. + * + * The user interface code talks to a userspace program via a + * netlink socket. + * + * The kernel side: + * - starts the userui program; + * - sends text messages and progress bar status; + * + * The user space side: + * - passes messages regarding user requests (abort, toggle reboot etc) + * + */ + +#define __KERNEL_SYSCALLS__ + +#include + +#include "tuxonice_sysfs.h" +#include "tuxonice_modules.h" +#include "tuxonice.h" +#include "tuxonice_ui.h" +#include "tuxonice_netlink.h" +#include "tuxonice_power_off.h" +#include "tuxonice_builtin.h" + +static char local_printf_buf[1024]; /* Same as printk - should be safe */ +struct ui_ops *toi_current_ui; + +/** + * toi_wait_for_keypress - Wait for keypress via userui or /dev/console. + * + * @timeout: Maximum time to wait. + * + * Wait for a keypress, either from userui or /dev/console if userui isn't + * available. The non-userui path is particularly for at boot-time, prior + * to userui being started, when we have an important warning to give to + * the user. + */ +static char toi_wait_for_keypress(int timeout) +{ + if (toi_current_ui && toi_current_ui->wait_for_key(timeout)) + return ' '; + + return toi_wait_for_keypress_dev_console(timeout); +} + +/* toi_early_boot_message() + * Description: Handle errors early in the process of booting. + * The user may press C to continue booting, perhaps + * invalidating the image, or space to reboot. + * This works from either the serial console or normally + * attached keyboard. + * + * Note that we come in here from init, while the kernel is + * locked. If we want to get events from the serial console, + * we need to temporarily unlock the kernel. + * + * toi_early_boot_message may also be called post-boot. + * In this case, it simply printks the message and returns. 
+ * + * Arguments: int Whether we are able to erase the image. + * int default_answer. What to do when we timeout. This + * will normally be continue, but the user might + * provide command line options (__setup) to override + * particular cases. + * Char *. Pointer to a string explaining why we're moaning. + */ + +#define say(message, a...) printk(KERN_EMERG message, ##a) + +void toi_early_boot_message(int message_detail, int default_answer, + char *warning_reason, ...) +{ +#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) + unsigned long orig_state = get_toi_state(), continue_req = 0; + unsigned long orig_loglevel = console_loglevel; + int can_ask = 1; +#else + int can_ask = 0; +#endif + + va_list args; + int printed_len; + + if (!toi_wait) { + set_toi_state(TOI_CONTINUE_REQ); + can_ask = 0; + } + + if (warning_reason) { + va_start(args, warning_reason); + printed_len = vsnprintf(local_printf_buf, + sizeof(local_printf_buf), + warning_reason, + args); + va_end(args); + } + + if (!test_toi_state(TOI_BOOT_TIME)) { + printk("TuxOnIce: %s\n", local_printf_buf); + return; + } + + if (!can_ask) { + continue_req = !!default_answer; + goto post_ask; + } + +#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) + console_loglevel = 7; + + say("=== TuxOnIce ===\n\n"); + if (warning_reason) { + say("BIG FAT WARNING!! %s\n\n", local_printf_buf); + switch (message_detail) { + case 0: + say("If you continue booting, note that any image WILL" + "NOT BE REMOVED.\nTuxOnIce is unable to do so " + "because the appropriate modules aren't\n" + "loaded. You should manually remove the image " + "to avoid any\npossibility of corrupting your " + "filesystem(s) later.\n"); + break; + case 1: + say("If you want to use the current TuxOnIce image, " + "reboot and try\nagain with the same kernel " + "that you hibernated from. If you want\n" + "to forget that image, continue and the image " + "will be erased.\n"); + break; + } + say("Press SPACE to reboot or C to continue booting with " + "this kernel\n\n"); + if (toi_wait > 0) + say("Default action if you don't select one in %d " + "seconds is: %s.\n", + toi_wait, + default_answer == TOI_CONTINUE_REQ ? + "continue booting" : "reboot"); + } else { + say("BIG FAT WARNING!!\n\n" + "You have tried to resume from this image before.\n" + "If it failed once, it may well fail again.\n" + "Would you like to remove the image and boot " + "normally?\nThis will be equivalent to entering " + "noresume on the\nkernel command line.\n\n" + "Press SPACE to remove the image or C to continue " + "resuming.\n\n"); + if (toi_wait > 0) + say("Default action if you don't select one in %d " + "seconds is: %s.\n", toi_wait, + !!default_answer ? + "continue resuming" : "remove the image"); + } + console_loglevel = orig_loglevel; + + set_toi_state(TOI_SANITY_CHECK_PROMPT); + clear_toi_state(TOI_CONTINUE_REQ); + + if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */ + continue_req = !!default_answer; + else + continue_req = test_toi_state(TOI_CONTINUE_REQ); + +#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */ + +post_ask: + if ((warning_reason) && (!continue_req)) + kernel_restart(NULL); + + restore_toi_state(orig_state); + if (continue_req) + set_toi_state(TOI_CONTINUE_REQ); +} + +#undef say + +/* + * User interface specific /sys/power/tuxonice entries. 
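+ *
+ * Note (editorial assumption, not from the original patch): with the
+ * "tuxonice" kobject created under /sys/power and the "user_interface"
+ * directory named in userui_ops below, these entries would be expected at
+ * paths such as /sys/power/tuxonice/user_interface/default_console_level;
+ * the exact layout is determined by the module registration code, which is
+ * not part of this hunk.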
+ */ + +static struct toi_sysfs_data sysfs_params[] = { +#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) + SYSFS_INT("default_console_level", SYSFS_RW, + &toi_bkd.toi_default_console_level, 0, 7, 0, NULL), + SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0, + 1 << 30, 0), + SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL, + 0) +#endif +}; + +static struct toi_module_ops userui_ops = { + .type = MISC_HIDDEN_MODULE, + .name = "printk ui", + .directory = "user_interface", + .module = THIS_MODULE, + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +int toi_register_ui_ops(struct ui_ops *this_ui) +{ + if (toi_current_ui) { + printk(KERN_INFO "Only one TuxOnIce user interface module can " + "be loaded at a time."); + return -EBUSY; + } + + toi_current_ui = this_ui; + + return 0; +} + +void toi_remove_ui_ops(struct ui_ops *this_ui) +{ + if (toi_current_ui != this_ui) + return; + + toi_current_ui = NULL; +} + +/* toi_console_sysfs_init + * Description: Boot time initialisation for user interface. + */ + +int toi_ui_init(void) +{ + return toi_register_module(&userui_ops); +} + +void toi_ui_exit(void) +{ + toi_unregister_module(&userui_ops); +} diff -Nur linux-4.2/kernel/power/tuxonice_ui.h linux-4.2-pck/kernel/power/tuxonice_ui.h --- linux-4.2/kernel/power/tuxonice_ui.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_ui.h 2015-09-08 11:20:32.473678317 -0300 @@ -0,0 +1,97 @@ +/* + * kernel/power/tuxonice_ui.h + * + * Copyright (C) 2004-2015 Nigel Cunningham (nigel at nigelcunningham com au) + */ + +enum { + DONT_CLEAR_BAR, + CLEAR_BAR +}; + +enum { + /* Userspace -> Kernel */ + USERUI_MSG_ABORT = 0x11, + USERUI_MSG_SET_STATE = 0x12, + USERUI_MSG_GET_STATE = 0x13, + USERUI_MSG_GET_DEBUG_STATE = 0x14, + USERUI_MSG_SET_DEBUG_STATE = 0x15, + USERUI_MSG_SPACE = 0x18, + USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A, + USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B, + USERUI_MSG_GET_LOGLEVEL = 0x1C, + USERUI_MSG_SET_LOGLEVEL = 0x1D, + USERUI_MSG_PRINTK = 0x1E, + + /* Kernel -> Userspace */ + USERUI_MSG_MESSAGE = 0x21, + USERUI_MSG_PROGRESS = 0x22, + USERUI_MSG_POST_ATOMIC_RESTORE = 0x25, + + USERUI_MSG_MAX, +}; + +struct userui_msg_params { + u32 a, b, c, d; + char text[255]; +}; + +struct ui_ops { + char (*wait_for_key) (int timeout); + u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...); + void (*prepare_status) (int clearbar, const char *fmt, ...); + void (*cond_pause) (int pause, char *message); + void (*abort)(int result_code, const char *fmt, ...); + void (*prepare)(void); + void (*cleanup)(void); + void (*message)(u32 section, u32 level, u32 normally_logged, + const char *fmt, ...); +}; + +extern struct ui_ops *toi_current_ui; + +#define toi_update_status(val, max, fmt, args...) \ + (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \ + max) + +#define toi_prepare_console(void) \ + do { if (toi_current_ui) \ + (toi_current_ui->prepare)(); \ + } while (0) + +#define toi_cleanup_console(void) \ + do { if (toi_current_ui) \ + (toi_current_ui->cleanup)(); \ + } while (0) + +#define abort_hibernate(result, fmt, args...) \ + do { if (toi_current_ui) \ + (toi_current_ui->abort)(result, fmt, ##args); \ + else { \ + set_abort_result(result); \ + } \ + } while (0) + +#define toi_cond_pause(pause, message) \ + do { if (toi_current_ui) \ + (toi_current_ui->cond_pause)(pause, message); \ + } while (0) + +#define toi_prepare_status(clear, fmt, args...) 
\ + do { if (toi_current_ui) \ + (toi_current_ui->prepare_status)(clear, fmt, ##args); \ + else \ + printk(KERN_INFO fmt "%s", ##args, "\n"); \ + } while (0) + +#define toi_message(sn, lev, log, fmt, a...) \ +do { \ + if (toi_current_ui && (!sn || test_debug_state(sn))) \ + toi_current_ui->message(sn, lev, log, fmt, ##a); \ +} while (0) + +__exit void toi_ui_cleanup(void); +extern int toi_ui_init(void); +extern void toi_ui_exit(void); +extern int toi_register_ui_ops(struct ui_ops *this_ui); +extern void toi_remove_ui_ops(struct ui_ops *this_ui); diff -Nur linux-4.2/kernel/power/tuxonice_userui.c linux-4.2-pck/kernel/power/tuxonice_userui.c --- linux-4.2/kernel/power/tuxonice_userui.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/kernel/power/tuxonice_userui.c 2015-09-08 11:20:32.473678317 -0300 @@ -0,0 +1,658 @@ +/* + * kernel/power/user_ui.c + * + * Copyright (C) 2005-2007 Bernard Blackham + * Copyright (C) 2002-2015 Nigel Cunningham (nigel at nigelcunningham com au) + * + * This file is released under the GPLv2. + * + * Routines for TuxOnIce's user interface. + * + * The user interface code talks to a userspace program via a + * netlink socket. + * + * The kernel side: + * - starts the userui program; + * - sends text messages and progress bar status; + * + * The user space side: + * - passes messages regarding user requests (abort, toggle reboot etc) + * + */ + +#define __KERNEL_SYSCALLS__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tuxonice_sysfs.h" +#include "tuxonice_modules.h" +#include "tuxonice.h" +#include "tuxonice_ui.h" +#include "tuxonice_netlink.h" +#include "tuxonice_power_off.h" + +static char local_printf_buf[1024]; /* Same as printk - should be safe */ + +static struct user_helper_data ui_helper_data; +static struct toi_module_ops userui_ops; +static int orig_kmsg; + +static char lastheader[512]; +static int lastheader_message_len; +static int ui_helper_changed; /* Used at resume-time so don't overwrite value + set from initrd/ramfs. */ + +/* Number of distinct progress amounts that userspace can display */ +static int progress_granularity = 30; + +static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key); +static int userui_wait_should_wake; + +#define toi_stop_waiting_for_userui_key() \ +{ \ + userui_wait_should_wake = true; \ + wake_up_interruptible(&userui_wait_for_key); \ +} + +/** + * ui_nl_set_state - Update toi_action based on a message from userui. + * + * @n: The bit (1 << bit) to set. + */ +static void ui_nl_set_state(int n) +{ + /* Only let them change certain settings */ + static const u32 toi_action_mask = + (1 << TOI_REBOOT) | (1 << TOI_PAUSE) | + (1 << TOI_LOGALL) | + (1 << TOI_SINGLESTEP) | + (1 << TOI_PAUSE_NEAR_PAGESET_END); + static unsigned long new_action; + + new_action = (toi_bkd.toi_action & (~toi_action_mask)) | + (n & toi_action_mask); + + printk(KERN_DEBUG "n is %x. Action flags being changed from %lx " + "to %lx.", n, toi_bkd.toi_action, new_action); + toi_bkd.toi_action = new_action; + + if (!test_action_state(TOI_PAUSE) && + !test_action_state(TOI_SINGLESTEP)) + toi_stop_waiting_for_userui_key(); +} + +/** + * userui_post_atomic_restore - Tell userui that atomic restore just happened. + * + * Tell userui that atomic restore just occured, so that it can do things like + * redrawing the screen, re-getting settings and so on. 
+ */ +static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd) +{ + toi_send_netlink_message(&ui_helper_data, + USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0); +} + +/** + * userui_storage_needed - Report how much memory in image header is needed. + */ +static int userui_storage_needed(void) +{ + return sizeof(ui_helper_data.program) + 1 + sizeof(int); +} + +/** + * userui_save_config_info - Fill buffer with config info for image header. + * + * @buf: Buffer into which to put the config info we want to save. + */ +static int userui_save_config_info(char *buf) +{ + *((int *) buf) = progress_granularity; + memcpy(buf + sizeof(int), ui_helper_data.program, + sizeof(ui_helper_data.program)); + return sizeof(ui_helper_data.program) + sizeof(int) + 1; +} + +/** + * userui_load_config_info - Restore config info from buffer. + * + * @buf: Buffer containing header info loaded. + * @size: Size of data loaded for this module. + */ +static void userui_load_config_info(char *buf, int size) +{ + progress_granularity = *((int *) buf); + size -= sizeof(int); + + /* Don't load the saved path if one has already been set */ + if (ui_helper_changed) + return; + + if (size > sizeof(ui_helper_data.program)) + size = sizeof(ui_helper_data.program); + + memcpy(ui_helper_data.program, buf + sizeof(int), size); + ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0'; +} + +/** + * set_ui_program_set: Record that userui program was changed. + * + * Side effect routine for when the userui program is set. In an initrd or + * ramfs, the user may set a location for the userui program. If this happens, + * we don't want to reload the value that was saved in the image header. This + * routine allows us to flag that we shouldn't restore the program name from + * the image header. + */ +static void set_ui_program_set(void) +{ + ui_helper_changed = 1; +} + +/** + * userui_memory_needed - Tell core how much memory to reserve for us. + */ +static int userui_memory_needed(void) +{ + /* ball park figure of 128 pages */ + return 128 * PAGE_SIZE; +} + +/** + * userui_update_status - Update the progress bar and (if on) in-bar message. + * + * @value: Current progress percentage numerator. + * @maximum: Current progress percentage denominator. + * @fmt: Message to be displayed in the middle of the progress bar. + * + * Note that a NULL message does not mean that any previous message is erased! + * For that, you need toi_prepare_status with clearbar on. + * + * Returns an unsigned long, being the next numerator (as determined by the + * maximum and progress granularity) where status needs to be updated. + * This is to reduce unnecessary calls to update_status. + */ +static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...) +{ + static u32 last_step = 9999; + struct userui_msg_params msg; + u32 this_step, next_update; + int bitshift; + + if (ui_helper_data.pid == -1) + return 0; + + if ((!maximum) || (!progress_granularity)) + return maximum; + + if (value < 0) + value = 0; + + if (value > maximum) + value = maximum; + + /* Try to avoid math problems - we can't do 64 bit math here + * (and shouldn't need it - anyone got screen resolution + * of 65536 pixels or more?) 
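+ *
+ * Worked example (illustrative figures, not from the original patch): with
+ * maximum = 100, value = 37 and progress_granularity = 30, fls(100) is 7,
+ * so bitshift is negative and the plain path below is used:
+ * this_step = 37 * 30 / 100 = 11 and
+ * next_update = (11 + 1) * 100 / 30 + 1 = 41, i.e. no further update is
+ * needed until value reaches 41.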
*/ + bitshift = fls(maximum) - 16; + if (bitshift > 0) { + u32 temp_maximum = maximum >> bitshift; + u32 temp_value = value >> bitshift; + this_step = (u32) + (temp_value * progress_granularity / temp_maximum); + next_update = (((this_step + 1) * temp_maximum / + progress_granularity) + 1) << bitshift; + } else { + this_step = (u32) (value * progress_granularity / maximum); + next_update = ((this_step + 1) * maximum / + progress_granularity) + 1; + } + + if (this_step == last_step) + return next_update; + + memset(&msg, 0, sizeof(msg)); + + msg.a = this_step; + msg.b = progress_granularity; + + if (fmt) { + va_list args; + va_start(args, fmt); + vsnprintf(msg.text, sizeof(msg.text), fmt, args); + va_end(args); + msg.text[sizeof(msg.text)-1] = '\0'; + } + + toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS, + &msg, sizeof(msg)); + last_step = this_step; + + return next_update; +} + +/** + * userui_message - Display a message without necessarily logging it. + * + * @section: Type of message. Messages can be filtered by type. + * @level: Degree of importance of the message. Lower values = higher priority. + * @normally_logged: Whether logged even if log_everything is off. + * @fmt: Message (and parameters). + * + * This function is intended to do the same job as printk, but without normally + * logging what is printed. The point is to be able to get debugging info on + * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M" + * + * It may be called from an interrupt context - can't sleep! + */ +static void userui_message(u32 section, u32 level, u32 normally_logged, + const char *fmt, ...) +{ + struct userui_msg_params msg; + + if ((level) && (level > console_loglevel)) + return; + + memset(&msg, 0, sizeof(msg)); + + msg.a = section; + msg.b = level; + msg.c = normally_logged; + + if (fmt) { + va_list args; + va_start(args, fmt); + vsnprintf(msg.text, sizeof(msg.text), fmt, args); + va_end(args); + msg.text[sizeof(msg.text)-1] = '\0'; + } + + if (test_action_state(TOI_LOGALL)) + printk(KERN_INFO "%s\n", msg.text); + + toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE, + &msg, sizeof(msg)); +} + +/** + * wait_for_key_via_userui - Wait for userui to receive a keypress. + */ +static void wait_for_key_via_userui(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&userui_wait_for_key, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + wait_event_interruptible(userui_wait_for_key, userui_wait_should_wake); + userui_wait_should_wake = false; + + set_current_state(TASK_RUNNING); + remove_wait_queue(&userui_wait_for_key, &wait); +} + +/** + * userui_prepare_status - Display high level messages. + * + * @clearbar: Whether to clear the progress bar. + * @fmt...: New message for the title. + * + * Prepare the 'nice display', drawing the header and version, along with the + * current action and perhaps also resetting the progress bar. + */ +static void userui_prepare_status(int clearbar, const char *fmt, ...) +{ + va_list args; + + if (fmt) { + va_start(args, fmt); + lastheader_message_len = vsnprintf(lastheader, 512, fmt, args); + va_end(args); + } + + if (clearbar) + toi_update_status(0, 1, NULL); + + if (ui_helper_data.pid == -1) + printk(KERN_EMERG "%s\n", lastheader); + else + toi_message(0, TOI_STATUS, 1, lastheader, NULL); +} + +/** + * toi_wait_for_keypress - Wait for keypress via userui. + * + * @timeout: Maximum time to wait. + * + * Wait for a keypress from userui. + * + * FIXME: Implement timeout? 
+ */ +static char userui_wait_for_keypress(int timeout) +{ + char key = '\0'; + + if (ui_helper_data.pid != -1) { + wait_for_key_via_userui(); + key = ' '; + } + + return key; +} + +/** + * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it. + * + * @result_code: Reason why we're aborting (1 << bit). + * @fmt: Message to display if telling the user what's going on. + * + * Abort a cycle. If this wasn't at the user's request (and we're displaying + * output), tell the user why and wait for them to acknowledge the message. + */ +static void userui_abort_hibernate(int result_code, const char *fmt, ...) +{ + va_list args; + int printed_len = 0; + + set_result_state(result_code); + + if (test_result_state(TOI_ABORTED)) + return; + + set_result_state(TOI_ABORTED); + + if (test_result_state(TOI_ABORT_REQUESTED)) + return; + + va_start(args, fmt); + printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf), + fmt, args); + va_end(args); + if (ui_helper_data.pid != -1) + printed_len = sprintf(local_printf_buf + printed_len, + " (Press SPACE to continue)"); + + toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf); + + if (ui_helper_data.pid != -1) + userui_wait_for_keypress(0); +} + +/** + * request_abort_hibernate - Abort hibernating or resuming at user request. + * + * Handle the user requesting the cancellation of a hibernation or resume by + * pressing escape. + */ +static void request_abort_hibernate(void) +{ + if (test_result_state(TOI_ABORT_REQUESTED) || + !test_action_state(TOI_CAN_CANCEL)) + return; + + if (test_toi_state(TOI_NOW_RESUMING)) { + toi_prepare_status(CLEAR_BAR, "Escape pressed. " + "Powering down again."); + set_toi_state(TOI_STOP_RESUME); + while (!test_toi_state(TOI_IO_STOPPED)) + schedule(); + if (toiActiveAllocator->mark_resume_attempted) + toiActiveAllocator->mark_resume_attempted(0); + toi_power_down(); + } + + toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :" + " ABORTING HIBERNATION ---"); + set_abort_result(TOI_ABORT_REQUESTED); + toi_stop_waiting_for_userui_key(); +} + +/** + * userui_user_rcv_msg - Receive a netlink message from userui. + * + * @skb: skb received. + * @nlh: Netlink header received. 
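+ *
+ * Return convention (editorial summary; the caller is assumed to be the
+ * shared TuxOnIce netlink code from tuxonice_netlink.h, which is not in
+ * this hunk): 0 when the message is consumed here (control messages are
+ * silently ignored), a negative errno for malformed or unprivileged
+ * requests, and 1 for message types left to the caller.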
+ */ +static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int type; + int *data; + + type = nlh->nlmsg_type; + + /* A control message: ignore them */ + if (type < NETLINK_MSG_BASE) + return 0; + + /* Unknown message: reply with EINVAL */ + if (type >= USERUI_MSG_MAX) + return -EINVAL; + + /* All operations require privileges, even GET */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* Only allow one task to receive NOFREEZE privileges */ + if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) { + printk(KERN_INFO "Got NOFREEZE_ME request when " + "ui_helper_data.pid is %d.\n", ui_helper_data.pid); + return -EBUSY; + } + + data = (int *) NLMSG_DATA(nlh); + + switch (type) { + case USERUI_MSG_ABORT: + request_abort_hibernate(); + return 0; + case USERUI_MSG_GET_STATE: + toi_send_netlink_message(&ui_helper_data, + USERUI_MSG_GET_STATE, &toi_bkd.toi_action, + sizeof(toi_bkd.toi_action)); + return 0; + case USERUI_MSG_GET_DEBUG_STATE: + toi_send_netlink_message(&ui_helper_data, + USERUI_MSG_GET_DEBUG_STATE, + &toi_bkd.toi_debug_state, + sizeof(toi_bkd.toi_debug_state)); + return 0; + case USERUI_MSG_SET_STATE: + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) + return -EINVAL; + ui_nl_set_state(*data); + return 0; + case USERUI_MSG_SET_DEBUG_STATE: + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) + return -EINVAL; + toi_bkd.toi_debug_state = (*data); + return 0; + case USERUI_MSG_SPACE: + toi_stop_waiting_for_userui_key(); + return 0; + case USERUI_MSG_GET_POWERDOWN_METHOD: + toi_send_netlink_message(&ui_helper_data, + USERUI_MSG_GET_POWERDOWN_METHOD, + &toi_poweroff_method, + sizeof(toi_poweroff_method)); + return 0; + case USERUI_MSG_SET_POWERDOWN_METHOD: + if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char))) + return -EINVAL; + toi_poweroff_method = (unsigned long)(*data); + return 0; + case USERUI_MSG_GET_LOGLEVEL: + toi_send_netlink_message(&ui_helper_data, + USERUI_MSG_GET_LOGLEVEL, + &toi_bkd.toi_default_console_level, + sizeof(toi_bkd.toi_default_console_level)); + return 0; + case USERUI_MSG_SET_LOGLEVEL: + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) + return -EINVAL; + toi_bkd.toi_default_console_level = (*data); + return 0; + case USERUI_MSG_PRINTK: + printk(KERN_INFO "%s", (char *) data); + return 0; + } + + /* Unhandled here */ + return 1; +} + +/** + * userui_cond_pause - Possibly pause at user request. + * + * @pause: Whether to pause or just display the message. + * @message: Message to display at the start of pausing. + * + * Potentially pause and wait for the user to tell us to continue. We normally + * only pause when @pause is set. While paused, the user can do things like + * changing the loglevel, toggling the display of debugging sections and such + * like. + */ +static void userui_cond_pause(int pause, char *message) +{ + int displayed_message = 0, last_key = 0; + + while (last_key != 32 && + ui_helper_data.pid != -1 && + ((test_action_state(TOI_PAUSE) && pause) || + (test_action_state(TOI_SINGLESTEP)))) { + if (!displayed_message) { + toi_prepare_status(DONT_CLEAR_BAR, + "%s Press SPACE to continue.%s", + message ? message : "", + (test_action_state(TOI_SINGLESTEP)) ? + " Single step on." : ""); + displayed_message = 1; + } + last_key = userui_wait_for_keypress(0); + } + schedule(); +} + +/** + * userui_prepare_console - Prepare the console for use. + * + * Prepare a console for use, saving current kmsg settings and attempting to + * start userui. Console loglevel changes are handled by userui. 
+ */ +static void userui_prepare_console(void) +{ + orig_kmsg = vt_kmsg_redirect(fg_console + 1); + + ui_helper_data.pid = -1; + + if (!userui_ops.enabled) { + printk(KERN_INFO "TuxOnIce: Userui disabled.\n"); + return; + } + + if (*ui_helper_data.program) + toi_netlink_setup(&ui_helper_data); + else + printk(KERN_INFO "TuxOnIce: Userui program not configured.\n"); +} + +/** + * userui_cleanup_console - Cleanup after a cycle. + * + * Tell userui to cleanup, and restore kmsg_redirect to its original value. + */ + +static void userui_cleanup_console(void) +{ + if (ui_helper_data.pid > -1) + toi_netlink_close(&ui_helper_data); + + vt_kmsg_redirect(orig_kmsg); +} + +/* + * User interface specific /sys/power/tuxonice entries. + */ + +static struct toi_sysfs_data sysfs_params[] = { +#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) + SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action, + TOI_CAN_CANCEL, 0), + SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action, + TOI_PAUSE, 0), + SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL), + SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, + 2048, 0, NULL), + SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0, + set_ui_program_set), + SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL) +#endif +}; + +static struct toi_module_ops userui_ops = { + .type = MISC_MODULE, + .name = "userui", + .shared_directory = "user_interface", + .module = THIS_MODULE, + .storage_needed = userui_storage_needed, + .save_config_info = userui_save_config_info, + .load_config_info = userui_load_config_info, + .memory_needed = userui_memory_needed, + .post_atomic_restore = userui_post_atomic_restore, + .sysfs_data = sysfs_params, + .num_sysfs_entries = sizeof(sysfs_params) / + sizeof(struct toi_sysfs_data), +}; + +static struct ui_ops my_ui_ops = { + .update_status = userui_update_status, + .message = userui_message, + .prepare_status = userui_prepare_status, + .abort = userui_abort_hibernate, + .cond_pause = userui_cond_pause, + .prepare = userui_prepare_console, + .cleanup = userui_cleanup_console, + .wait_for_key = userui_wait_for_keypress, +}; + +/** + * toi_user_ui_init - Boot time initialisation for user interface. + * + * Invoked from the core init routine. 
+ */ +static __init int toi_user_ui_init(void) +{ + int result; + + ui_helper_data.nl = NULL; + strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255); + ui_helper_data.pid = -1; + ui_helper_data.skb_size = sizeof(struct userui_msg_params); + ui_helper_data.pool_limit = 6; + ui_helper_data.netlink_id = NETLINK_TOI_USERUI; + ui_helper_data.name = "userspace ui"; + ui_helper_data.rcv_msg = userui_user_rcv_msg; + ui_helper_data.interface_version = 8; + ui_helper_data.must_init = 0; + ui_helper_data.not_ready = userui_cleanup_console; + init_completion(&ui_helper_data.wait_for_process); + result = toi_register_module(&userui_ops); + if (!result) { + result = toi_register_ui_ops(&my_ui_ops); + if (result) + toi_unregister_module(&userui_ops); + } + + return result; +} + +late_initcall(toi_user_ui_init); diff -Nur linux-4.2/kernel/printk/printk.c linux-4.2-pck/kernel/printk/printk.c --- linux-4.2/kernel/printk/printk.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/printk/printk.c 2015-09-08 11:20:32.500343682 -0300 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -280,6 +281,20 @@ static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +#ifdef CONFIG_TOI_INCREMENTAL +void toi_set_logbuf_untracked(void) +{ + int i; + struct page *log_buf_start_page = virt_to_page(__log_buf); + + printk("Not protecting kernel printk log buffer (%p-%p).\n", + __log_buf, __log_buf + __LOG_BUF_LEN); + + for (i = 0; i < (1 << (CONFIG_LOG_BUF_SHIFT - PAGE_SHIFT)); i++) + SetPageTOI_Untracked(log_buf_start_page + i); +} +#endif + /* Return log buffer address */ char *log_buf_addr_get(void) { @@ -1450,9 +1465,9 @@ !(con->flags & CON_ANYTIME)) continue; if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); + con->write(con, ext_text, ext_len, level); else - con->write(con, text, len); + con->write(con, text, len, level); } } @@ -1973,7 +1988,7 @@ n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); - early_console->write(early_console, buf, n); + early_console->write(early_console, buf, n, 0); } #endif diff -Nur linux-4.2/kernel/sched/fair.c linux-4.2-pck/kernel/sched/fair.c --- linux-4.2/kernel/sched/fair.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/sched/fair.c 2015-09-08 00:48:22.241696581 -0300 @@ -47,8 +47,13 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ +#ifdef CONFIG_PCK_INTERACTIVE +unsigned int sysctl_sched_latency = 3000000ULL; +unsigned int normalized_sysctl_sched_latency = 3000000ULL; +#else unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +#endif /* * The initial- and re-scaling of tunables is configurable @@ -66,13 +71,22 @@ * Minimal preemption granularity for CPU-bound tasks: * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ +#ifdef CONFIG_PCK_INTERACTIVE +unsigned int sysctl_sched_min_granularity = 300000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; +#else unsigned int sysctl_sched_min_granularity = 750000ULL; unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +#endif /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ +#ifdef CONFIG_PCK_INTERACTIVE +static unsigned int sched_nr_latency = 10; +#else static unsigned int sched_nr_latency = 8; +#endif /* * After fork, child runs first. If set to 0 (default) then @@ -88,10 +102,17 @@ * and reduces their over-scheduling. 
Synchronous workloads will still * have immediate wakeup/sleep latencies. */ +#ifdef CONFIG_PCK_INTERACTIVE +unsigned int sysctl_sched_wakeup_granularity = 500000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; + +const_debug unsigned int sysctl_sched_migration_cost = 250000UL; +#else unsigned int sysctl_sched_wakeup_granularity = 1000000UL; unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#endif /* * The exponential sliding window over which load is averaged for shares @@ -111,8 +132,12 @@ * * default: 5 msec, units: microseconds */ +#ifdef CONFIG_PCK_INTERACTIVE +unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; +#else unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +#endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) { diff -Nur linux-4.2/kernel/smpboot.c linux-4.2-pck/kernel/smpboot.c --- linux-4.2/kernel/smpboot.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/smpboot.c 2015-09-08 11:20:32.507010024 -0300 @@ -173,7 +173,7 @@ if (tsk) return 0; - td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); + td = kzalloc_node(sizeof(*td), GFP_KERNEL | ___GFP_TOI_NOTRACK, cpu_to_node(cpu)); if (!td) return -ENOMEM; td->cpu = cpu; diff -Nur linux-4.2/kernel/workqueue.c linux-4.2-pck/kernel/workqueue.c --- linux-4.2/kernel/workqueue.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/kernel/workqueue.c 2015-09-08 00:48:22.241696581 -0300 @@ -2614,7 +2614,7 @@ out_unlock: mutex_unlock(&wq->mutex); } -EXPORT_SYMBOL_GPL(flush_workqueue); +EXPORT_SYMBOL(flush_workqueue); /** * drain_workqueue - drain a workqueue diff -Nur linux-4.2/lib/Makefile linux-4.2-pck/lib/Makefile --- linux-4.2/lib/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/lib/Makefile 2015-09-08 00:51:03.460668141 -0300 @@ -8,7 +8,7 @@ endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o dump_stack.o timerqueue.o\ + rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o argv_split.o \ proportions.o flex_proportions.o ratelimit.o show_mem.o \ diff -Nur linux-4.2/lib/sradix-tree.c linux-4.2-pck/lib/sradix-tree.c --- linux-4.2/lib/sradix-tree.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/lib/sradix-tree.c 2015-09-08 00:51:03.460668141 -0300 @@ -0,0 +1,476 @@ +#include +#include +#include +#include +#include +#include +#include + +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) +{ + return node->fulls == root->stores_size || + (node->height == 1 && node->count == root->stores_size); +} + +/* + * Extend a sradix tree so it can store key @index. + */ +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) +{ + struct sradix_tree_node *node; + unsigned int height; + + if (unlikely(root->rnode == NULL)) { + if (!(node = root->alloc())) + return -ENOMEM; + + node->height = 1; + root->rnode = node; + root->height = 1; + } + + /* Figure out what the height should be. */ + height = root->height; + index >>= root->shift * height; + + while (index) { + index >>= root->shift; + height++; + } + + while (height > root->height) { + unsigned int newheight; + if (!(node = root->alloc())) + return -ENOMEM; + + /* Increase the height. 
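+ *
+ * Illustrative example (a shift of 6, i.e. 64 slots per node, is an
+ * assumed value, not taken from the original patch): a height-1 tree
+ * covers indices 0..63, so storing index 100 makes the loop above
+ * compute a height of 2; one pass through this loop then chains in a
+ * new level, with the old root becoming stores[0] of the new root.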
*/ + node->stores[0] = root->rnode; + root->rnode->parent = node; + if (root->extend) + root->extend(node, root->rnode); + + newheight = root->height + 1; + node->height = newheight; + node->count = 1; + if (sradix_node_full(root, root->rnode)) + node->fulls = 1; + + root->rnode = node; + root->height = newheight; + } + + return 0; +} + +/* + * Search the next item from the current node, that is not NULL + * and can satify root->iter(). + */ +void *sradix_tree_next(struct sradix_tree_root *root, + struct sradix_tree_node *node, unsigned long index, + int (*iter)(void *item, unsigned long height)) +{ + unsigned long offset; + void *item; + + if (unlikely(node == NULL)) { + node = root->rnode; + for (offset = 0; offset < root->stores_size; offset++) { + item = node->stores[offset]; + if (item && (!iter || iter(item, node->height))) + break; + } + + if (unlikely(offset >= root->stores_size)) + return NULL; + + if (node->height == 1) + return item; + else + goto go_down; + } + + while (node) { + offset = (index & root->mask) + 1; + for (;offset < root->stores_size; offset++) { + item = node->stores[offset]; + if (item && (!iter || iter(item, node->height))) + break; + } + + if (offset < root->stores_size) + break; + + node = node->parent; + index >>= root->shift; + } + + if (!node) + return NULL; + + while (node->height > 1) { +go_down: + node = item; + for (offset = 0; offset < root->stores_size; offset++) { + item = node->stores[offset]; + if (item && (!iter || iter(item, node->height))) + break; + } + + if (unlikely(offset >= root->stores_size)) + return NULL; + } + + BUG_ON(offset > root->stores_size); + + return item; +} + +/* + * Blindly insert the item to the tree. Typically, we reuse the + * first empty store item. + */ +int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) +{ + unsigned long index; + unsigned int height; + struct sradix_tree_node *node, *tmp = NULL; + int offset, offset_saved; + void **store = NULL; + int error, i, j, shift; + +go_on: + index = root->min; + + if (root->enter_node && !sradix_node_full(root, root->enter_node)) { + node = root->enter_node; + BUG_ON((index >> (root->shift * root->height))); + } else { + node = root->rnode; + if (node == NULL || (index >> (root->shift * root->height)) + || sradix_node_full(root, node)) { + error = sradix_tree_extend(root, index); + if (error) + return error; + + node = root->rnode; + } + } + + + height = node->height; + shift = (height - 1) * root->shift; + offset = (index >> shift) & root->mask; + while (shift > 0) { + offset_saved = offset; + for (; offset < root->stores_size; offset++) { + store = &node->stores[offset]; + tmp = *store; + + if (!tmp || !sradix_node_full(root, tmp)) + break; + } + BUG_ON(offset >= root->stores_size); + + if (offset != offset_saved) { + index += (offset - offset_saved) << shift; + index &= ~((1UL << shift) - 1); + } + + if (!tmp) { + if (!(tmp = root->alloc())) + return -ENOMEM; + + tmp->height = shift / root->shift; + *store = tmp; + tmp->parent = node; + node->count++; +// if (root->extend) +// root->extend(node, tmp); + } + + node = tmp; + shift -= root->shift; + offset = (index >> shift) & root->mask; + } + + BUG_ON(node->height != 1); + + + store = &node->stores[offset]; + for (i = 0, j = 0; + j < root->stores_size - node->count && + i < root->stores_size - offset && j < num; i++) { + if (!store[i]) { + store[i] = item[j]; + if (root->assign) + root->assign(node, index + i, item[j]); + j++; + } + } + + node->count += j; + root->num += j; + num -= j; + + while 
(sradix_node_full(root, node)) { + node = node->parent; + if (!node) + break; + + node->fulls++; + } + + if (unlikely(!node)) { + /* All nodes are full */ + root->min = 1 << (root->height * root->shift); + root->enter_node = NULL; + } else { + root->min = index + i - 1; + root->min |= (1UL << (node->height - 1)) - 1; + root->min++; + root->enter_node = node; + } + + if (num) { + item += j; + goto go_on; + } + + return 0; +} + + +/** + * sradix_tree_shrink - shrink height of a sradix tree to minimal + * @root sradix tree root + * + */ +static inline void sradix_tree_shrink(struct sradix_tree_root *root) +{ + /* try to shrink tree height */ + while (root->height > 1) { + struct sradix_tree_node *to_free = root->rnode; + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost store, we cannot shrink. + */ + if (to_free->count != 1 || !to_free->stores[0]) + break; + + root->rnode = to_free->stores[0]; + root->rnode->parent = NULL; + root->height--; + if (unlikely(root->enter_node == to_free)) { + root->enter_node = NULL; + } + root->free(to_free); + } +} + +/* + * Del the item on the known leaf node and index + */ +void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, + struct sradix_tree_node *node, unsigned long index) +{ + unsigned int offset; + struct sradix_tree_node *start, *end; + + BUG_ON(node->height != 1); + + start = node; + while (node && !(--node->count)) + node = node->parent; + + end = node; + if (!node) { + root->rnode = NULL; + root->height = 0; + root->min = 0; + root->num = 0; + root->enter_node = NULL; + } else { + offset = (index >> (root->shift * (node->height - 1))) & root->mask; + if (root->rm) + root->rm(node, offset); + node->stores[offset] = NULL; + root->num--; + if (root->min > index) { + root->min = index; + root->enter_node = node; + } + } + + if (start != end) { + do { + node = start; + start = start->parent; + if (unlikely(root->enter_node == node)) + root->enter_node = end; + root->free(node); + } while (start != end); + + /* + * Note that shrink may free "end", so enter_node still need to + * be checked inside. + */ + sradix_tree_shrink(root); + } else if (node->count == root->stores_size - 1) { + /* It WAS a full leaf node. Update the ancestors */ + node = node->parent; + while (node) { + node->fulls--; + if (node->fulls != root->stores_size - 1) + break; + + node = node->parent; + } + } +} + +void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) +{ + unsigned int height, offset; + struct sradix_tree_node *node; + int shift; + + node = root->rnode; + if (node == NULL || (index >> (root->shift * root->height))) + return NULL; + + height = root->height; + shift = (height - 1) * root->shift; + + do { + offset = (index >> shift) & root->mask; + node = node->stores[offset]; + if (!node) + return NULL; + + shift -= root->shift; + } while (shift >= 0); + + return node; +} + +/* + * Return the item if it exists, otherwise create it in place + * and return the created item. 
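+ *
+ * Hypothetical call pattern (for illustration only, not part of the
+ * original patch):
+ *
+ *	item = sradix_tree_lookup_create(root, index, my_item_alloc);
+ *
+ * where my_item_alloc is a caller-supplied constructor for the new item;
+ * passing NULL means no item is created and NULL is returned when the
+ * index is not already populated.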
+ */ +void *sradix_tree_lookup_create(struct sradix_tree_root *root, + unsigned long index, void *(*item_alloc)(void)) +{ + unsigned int height, offset; + struct sradix_tree_node *node, *tmp; + void *item; + int shift, error; + + if (root->rnode == NULL || (index >> (root->shift * root->height))) { + if (item_alloc) { + error = sradix_tree_extend(root, index); + if (error) + return NULL; + } else { + return NULL; + } + } + + node = root->rnode; + height = root->height; + shift = (height - 1) * root->shift; + + do { + offset = (index >> shift) & root->mask; + if (!node->stores[offset]) { + if (!(tmp = root->alloc())) + return NULL; + + tmp->height = shift / root->shift; + node->stores[offset] = tmp; + tmp->parent = node; + node->count++; + node = tmp; + } else { + node = node->stores[offset]; + } + + shift -= root->shift; + } while (shift > 0); + + BUG_ON(node->height != 1); + offset = index & root->mask; + if (node->stores[offset]) { + return node->stores[offset]; + } else if (item_alloc) { + if (!(item = item_alloc())) + return NULL; + + node->stores[offset] = item; + + /* + * NOTE: we do NOT call root->assign here, since this item is + * newly created by us having no meaning. Caller can call this + * if it's necessary to do so. + */ + + node->count++; + root->num++; + + while (sradix_node_full(root, node)) { + node = node->parent; + if (!node) + break; + + node->fulls++; + } + + if (unlikely(!node)) { + /* All nodes are full */ + root->min = 1 << (root->height * root->shift); + } else { + if (root->min == index) { + root->min |= (1UL << (node->height - 1)) - 1; + root->min++; + root->enter_node = node; + } + } + + return item; + } else { + return NULL; + } + +} + +int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) +{ + unsigned int height, offset; + struct sradix_tree_node *node; + int shift; + + node = root->rnode; + if (node == NULL || (index >> (root->shift * root->height))) + return -ENOENT; + + height = root->height; + shift = (height - 1) * root->shift; + + do { + offset = (index >> shift) & root->mask; + node = node->stores[offset]; + if (!node) + return -ENOENT; + + shift -= root->shift; + } while (shift > 0); + + offset = index & root->mask; + if (!node->stores[offset]) + return -ENOENT; + + sradix_tree_delete_from_leaf(root, node, index); + + return 0; +} diff -Nur linux-4.2/mm/Kconfig linux-4.2-pck/mm/Kconfig --- linux-4.2/mm/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/Kconfig 2015-09-08 00:51:03.460668141 -0300 @@ -340,6 +340,32 @@ See Documentation/vm/ksm.txt for more information: KSM is inactive until a program has madvised that an area is MADV_MERGEABLE, and root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). +choice + prompt "Choose UKSM/KSM strategy" + default UKSM + depends on KSM + help + This option allows to select a UKSM/KSM stragety. + +config UKSM + bool "Ultra-KSM for page merging" + depends on KSM + help + UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same + page Merging), but with a fundamentally rewritten core algorithm. With + an advanced algorithm, UKSM now can transparently scans all anonymously + mapped user space applications with an significantly improved scan speed + and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from + UKSM. Now UKSM has its first stable release and first real world enterprise user. + For more information, please goto its project page. 
+ (www.kerneldedup.org) + +config KSM_LEGACY + bool "Legacy KSM implementation" + depends on KSM + help + The legacy KSM implementation from Redhat. +endchoice config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" diff -Nur linux-4.2/mm/Makefile linux-4.2-pck/mm/Makefile --- linux-4.2/mm/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/Makefile 2015-09-08 00:51:03.464001315 -0300 @@ -47,7 +47,8 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o -obj-$(CONFIG_KSM) += ksm.o +obj-$(CONFIG_KSM_LEGACY) += ksm.o +obj-$(CONFIG_UKSM) += uksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o diff -Nur linux-4.2/mm/debug.c linux-4.2-pck/mm/debug.c --- linux-4.2/mm/debug.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/debug.c 2015-09-08 11:20:32.533675385 -0300 @@ -48,6 +48,12 @@ #ifdef CONFIG_TRANSPARENT_HUGEPAGE {1UL << PG_compound_lock, "compound_lock" }, #endif +#ifdef CONFIG_TOI_INCREMENTAL + {1UL << PG_toi_untracked, "toi_untracked" }, + {1UL << PG_toi_ro, "toi_ro" }, + {1UL << PG_toi_cbw, "toi_cbw" }, + {1UL << PG_toi_dirty, "toi_dirty" }, +#endif }; static void dump_flags(unsigned long flags, diff -Nur linux-4.2/mm/memory.c linux-4.2-pck/mm/memory.c --- linux-4.2/mm/memory.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/memory.c 2015-09-08 00:53:25.270528186 -0300 @@ -120,6 +120,28 @@ EXPORT_SYMBOL(zero_pfn); +#ifdef CONFIG_UKSM +unsigned long uksm_zero_pfn __read_mostly; +EXPORT_SYMBOL_GPL(uksm_zero_pfn); +struct page *empty_uksm_zero_page; + +static int __init setup_uksm_zero_page(void) +{ + unsigned long addr; + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0); + if (!addr) + panic("Oh boy, that early out of memory?"); + + empty_uksm_zero_page = virt_to_page((void *) addr); + SetPageReserved(empty_uksm_zero_page); + + uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); + + return 0; +} +core_initcall(setup_uksm_zero_page); +#endif + /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ @@ -131,6 +153,7 @@ core_initcall(init_zero_pfn); + #if defined(SPLIT_RSS_COUNTING) void sync_mm_rss(struct mm_struct *mm) @@ -877,6 +900,11 @@ rss[MM_ANONPAGES]++; else rss[MM_FILEPAGES]++; + + /* Should return NULL in vm_normal_page() */ + uksm_bugon_zeropage(pte); + } else { + uksm_map_zero_page(pte); } out_set_pte: @@ -1110,8 +1138,10 @@ ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); - if (unlikely(!page)) + if (unlikely(!page)) { + uksm_unmap_zero_page(ptent); continue; + } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -1944,8 +1974,10 @@ clear_page(kaddr); kunmap_atomic(kaddr); flush_dcache_page(dst); - } else + } else { copy_user_highpage(dst, src, va, vma); + uksm_cow_page(vma, src); + } } /* @@ -2075,6 +2107,7 @@ new_page = alloc_zeroed_user_highpage_movable(vma, address); if (!new_page) goto oom; + uksm_cow_pte(vma, orig_pte); } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) @@ -2099,7 +2132,9 @@ dec_mm_counter_fast(mm, MM_FILEPAGES); inc_mm_counter_fast(mm, MM_ANONPAGES); } + uksm_bugon_zeropage(orig_pte); } else { + uksm_unmap_zero_page(orig_pte); inc_mm_counter_fast(mm, MM_ANONPAGES); } flush_cache_page(vma, address, pte_pfn(orig_pte)); diff -Nur linux-4.2/mm/mmap.c linux-4.2-pck/mm/mmap.c --- linux-4.2/mm/mmap.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/mmap.c 
2015-09-08 00:51:03.464001315 -0300 @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -276,6 +277,7 @@ if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); + uksm_remove_vma(vma); kmem_cache_free(vm_area_cachep, vma); return next; } @@ -733,9 +735,16 @@ long adjust_next = 0; int remove_next = 0; +/* + * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is + * acquired + */ + uksm_remove_vma(vma); + if (next && !insert) { struct vm_area_struct *exporter = NULL; + uksm_remove_vma(next); if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -829,6 +838,7 @@ end_changed = true; } vma->vm_pgoff = pgoff; + if (adjust_next) { next->vm_start += adjust_next << PAGE_SHIFT; next->vm_pgoff += adjust_next; @@ -899,16 +909,22 @@ * up the code too much to do both in one go. */ next = vma->vm_next; - if (remove_next == 2) + if (remove_next == 2) { + uksm_remove_vma(next); goto again; - else if (next) + } else if (next) { vma_gap_update(next); - else + } else { mm->highest_vm_end = end; + } + } else { + if (next && !insert) + uksm_vma_add_new(next); } if (insert && file) uprobe_mmap(insert); + uksm_vma_add_new(vma); validate_mm(mm); return 0; @@ -1301,6 +1317,9 @@ vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + /* If uksm is enabled, we add VM_MERGABLE to new VMAs. */ + uksm_vm_flags_mod(&vm_flags); + if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; @@ -1642,6 +1661,7 @@ allow_write_access(file); } file = vma->vm_file; + uksm_vma_add_new(vma); out: perf_event_mmap(vma); @@ -1683,6 +1703,7 @@ if (vm_flags & VM_DENYWRITE) allow_write_access(file); free_vma: + uksm_remove_vma(vma); kmem_cache_free(vm_area_cachep, vma); unacct_error: if (charged) @@ -2484,6 +2505,8 @@ else err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + uksm_vma_add_new(new); + /* Success. */ if (!err) return 0; @@ -2721,6 +2744,7 @@ return addr; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + uksm_vm_flags_mod(&flags); error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (error & ~PAGE_MASK) @@ -2778,6 +2802,7 @@ vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); vma_link(mm, vma, prev, rb_link, rb_parent); + uksm_vma_add_new(vma); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -2813,6 +2838,12 @@ /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); + /* + * Taking write lock on mmap_sem does not harm others, + * but it's crucial for uksm to avoid races. 
+ */ + down_write(&mm->mmap_sem); + if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -2848,6 +2879,11 @@ vma = remove_vma(vma); } vm_unacct_memory(nr_accounted); + + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + vmacache_invalidate(mm); + up_write(&mm->mmap_sem); } /* Insert vm structure into process list sorted by address @@ -2954,6 +2990,7 @@ new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); *need_rmap_locks = false; + uksm_vma_add_new(new_vma); } } return new_vma; @@ -3067,10 +3104,10 @@ ret = insert_vm_struct(mm, vma); if (ret) goto out; - mm->total_vm += len >> PAGE_SHIFT; perf_event_mmap(vma); + uksm_vma_add_new(vma); return vma; diff -Nur linux-4.2/mm/page-writeback.c linux-4.2-pck/mm/page-writeback.c --- linux-4.2/mm/page-writeback.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/page-writeback.c 2015-09-08 00:48:22.241696581 -0300 @@ -70,13 +70,21 @@ /* * Start background writeback (via writeback threads) at this percentage */ +#ifdef CONFIG_PCK_INTERACTIVE +int dirty_background_ratio; +#else int dirty_background_ratio = 10; +#endif /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ +#ifdef CONFIG_PCK_INTERACTIVE +unsigned long dirty_background_bytes = 128 * 1024 * 1024; +#else unsigned long dirty_background_bytes; +#endif /* * free highmem will not be subtracted from the total free memory @@ -87,13 +95,21 @@ /* * The generator of dirty data starts writeback at this percentage */ +#ifdef CONFIG_PCK_INTERACTIVE +int vm_dirty_ratio; +#else int vm_dirty_ratio = 20; +#endif /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ +#ifdef CONFIG_PCK_INTERACTIVE +unsigned long vm_dirty_bytes = 256 * 1024 * 1024; +#else unsigned long vm_dirty_bytes; +#endif /* * The interval between `kupdate'-style writebacks diff -Nur linux-4.2/mm/page_alloc.c linux-4.2-pck/mm/page_alloc.c --- linux-4.2/mm/page_alloc.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/page_alloc.c 2015-09-08 11:20:32.543674897 -0300 @@ -61,6 +61,7 @@ #include #include #include +#include #include #include @@ -726,6 +727,12 @@ if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif + if (unlikely(PageTOI_Untracked(page))) { + // Make it writable and included in image if allocated. + ClearPageTOI_Untracked(page); + // If it gets allocated, it will be dirty from TOI's POV. + SetPageTOI_Dirty(page); + } if (unlikely(bad_reason)) { bad_page(page, bad_reason, bad_flags); return 1; @@ -1324,6 +1331,11 @@ struct page *p = page + i; if (unlikely(check_new_page(p))) return 1; + if (unlikely(toi_incremental_support() && gfp_flags & ___GFP_TOI_NOTRACK)) { + // Make the page writable if it's protected, and set it to be untracked. 
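+ // Untracked pages fall outside TuxOnIce's incremental-image change + // tracking and are simply saved on every cycle; see the + // CONFIG_TOI_INCREMENTAL comment added to mm/percpu.c in this patch.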
+ SetPageTOI_Untracked(p); + toi_make_writable(init_mm.pgd, (unsigned long) page_address(p)); + } } set_page_private(page, 0); diff -Nur linux-4.2/mm/percpu.c linux-4.2-pck/mm/percpu.c --- linux-4.2/mm/percpu.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/percpu.c 2015-09-08 11:20:32.543674897 -0300 @@ -125,6 +125,7 @@ static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; +static int pcpu_pfns; /* cpus with the lowest and highest unit addresses */ static unsigned int pcpu_low_unit_cpu __read_mostly; @@ -1795,6 +1796,7 @@ /* calculate size_sum and ensure dyn_size is enough for early alloc */ size_sum = PFN_ALIGN(static_size + reserved_size + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); + pcpu_pfns = PFN_DOWN(size_sum); dyn_size = size_sum - static_size - reserved_size; /* @@ -2282,6 +2284,22 @@ } } +#ifdef CONFIG_TOI_INCREMENTAL +/* + * It doesn't matter if we mark an extra page as untracked (and therefore + * always save it in incremental images). + */ +void toi_mark_per_cpus_pages_untracked(void) +{ + int i; + + struct page *page = virt_to_page(pcpu_base_addr); + + for (i = 0; i < pcpu_pfns; i++) + SetPageTOI_Untracked(page + i); +} +#endif + /* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up diff -Nur linux-4.2/mm/rmap.c linux-4.2-pck/mm/rmap.c --- linux-4.2/mm/rmap.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/rmap.c 2015-09-08 00:51:03.464001315 -0300 @@ -962,9 +962,9 @@ /** * __page_set_anon_rmap - set up new anonymous rmap - * @page: Page to add to rmap + * @page: Page to add to rmap * @vma: VM area to add page to. - * @address: User virtual address of the mapping + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct page *page, diff -Nur linux-4.2/mm/shmem.c linux-4.2-pck/mm/shmem.c --- linux-4.2/mm/shmem.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/shmem.c 2015-09-08 11:20:32.543674897 -0300 @@ -1396,7 +1396,7 @@ } static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) + umode_t mode, dev_t dev, unsigned long flags, int atomic_copy) { struct inode *inode; struct shmem_inode_info *info; @@ -1417,6 +1417,8 @@ spin_lock_init(&info->lock); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; + if (atomic_copy) + inode->i_flags |= S_ATOMIC_COPY; INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); @@ -2206,7 +2208,7 @@ struct inode *inode; int error = -ENOSPC; - inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); + inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE, 0); if (inode) { error = simple_acl_create(dir, inode); if (error) @@ -2235,7 +2237,7 @@ struct inode *inode; int error = -ENOSPC; - inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); + inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE, 0); if (inode) { error = security_inode_init_security(inode, dir, NULL, @@ -2428,7 +2430,7 @@ if (len > PAGE_CACHE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); + inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE, 0); if (!inode) return -ENOSPC; @@ -2946,7 +2948,7 @@ goto err_name; } - file = 
shmem_file_setup(name, 0, VM_NORESERVE); + file = shmem_file_setup(name, 0, VM_NORESERVE, 0); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_fd; @@ -3037,7 +3039,7 @@ sb->s_flags |= MS_POSIXACL; #endif - inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); + inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE, 0); if (!inode) goto failed; inode->i_uid = sbinfo->uid; @@ -3291,7 +3293,7 @@ #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations -#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) +#define shmem_get_inode(sb, dir, mode, dev, flags, atomic_copy) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) @@ -3304,7 +3306,8 @@ }; static struct file *__shmem_file_setup(const char *name, loff_t size, - unsigned long flags, unsigned int i_flags) + unsigned long flags, unsigned int i_flags, + int atomic_copy) { struct file *res; struct inode *inode; @@ -3333,7 +3336,7 @@ d_set_d_op(path.dentry, &anon_ops); res = ERR_PTR(-ENOSPC); - inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); + inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags, atomic_copy); if (!inode) goto put_memory; @@ -3369,9 +3372,9 @@ * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ -struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) +struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags, int atomic_copy) { - return __shmem_file_setup(name, size, flags, S_PRIVATE); + return __shmem_file_setup(name, size, flags, S_PRIVATE, atomic_copy); } /** @@ -3380,9 +3383,9 @@ * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ -struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags, int atomic_copy) { - return __shmem_file_setup(name, size, flags, 0); + return __shmem_file_setup(name, size, flags, 0, atomic_copy); } EXPORT_SYMBOL_GPL(shmem_file_setup); @@ -3401,7 +3404,7 @@ * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). 
*/ - file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE); + file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE, 0); if (IS_ERR(file)) return PTR_ERR(file); diff -Nur linux-4.2/mm/slub.c linux-4.2-pck/mm/slub.c --- linux-4.2/mm/slub.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/slub.c 2015-09-08 11:20:32.547008068 -0300 @@ -1315,7 +1315,7 @@ struct page *page; int order = oo_order(oo); - flags |= __GFP_NOTRACK; + flags |= (__GFP_NOTRACK | ___GFP_TOI_NOTRACK); if (memcg_charge_slab(s, flags, order)) return NULL; @@ -3331,7 +3331,7 @@ struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_TOI_NOTRACK; page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); diff -Nur linux-4.2/mm/swapfile.c linux-4.2-pck/mm/swapfile.c --- linux-4.2/mm/swapfile.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/swapfile.c 2015-09-08 11:20:32.550341239 -0300 @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -43,7 +44,6 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static sector_t map_swap_entry(swp_entry_t, struct block_device**); DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -722,6 +722,60 @@ return (swp_entry_t) {0}; } +static unsigned int find_next_to_unuse(struct swap_info_struct *si, + unsigned int prev, bool frontswap); + +void get_swap_range_of_type(int type, swp_entry_t *start, swp_entry_t *end, + unsigned int limit) +{ + struct swap_info_struct *si; + pgoff_t start_at; + unsigned int i; + + *start = swp_entry(0, 0); + *end = swp_entry(0, 0); + si = swap_info[type]; + spin_lock(&si->lock); + if (si && (si->flags & SWP_WRITEOK)) { + atomic_long_dec(&nr_swap_pages); + /* This is called for allocating swap entry, not cache */ + start_at = scan_swap_map(si, 1); + if (start_at) { + unsigned long stop_at = find_next_to_unuse(si, start_at, 0); + if (stop_at > start_at) + stop_at--; + else + stop_at = si->max - 1; + if (stop_at - start_at + 1 > limit) + stop_at = min_t(unsigned int, + start_at + limit - 1, + si->max - 1); + /* Mark them used */ + for (i = start_at; i <= stop_at; i++) + si->swap_map[i] = 1; + /* first page already done above */ + si->inuse_pages += stop_at - start_at; + + atomic_long_sub(stop_at - start_at, &nr_swap_pages); + if (start_at == si->lowest_bit) + si->lowest_bit = stop_at + 1; + if (stop_at == si->highest_bit) + si->highest_bit = start_at - 1; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + } + for (i = start_at + 1; i <= stop_at; i++) + inc_cluster_info_page(si, si->cluster_info, i); + si->cluster_next = stop_at + 1; + *start = swp_entry(type, start_at); + *end = swp_entry(type, stop_at); + } else + atomic_long_inc(&nr_swap_pages); + } + spin_unlock(&si->lock); +} + static struct swap_info_struct *swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; @@ -1576,7 +1630,7 @@ * Note that the type of this function is sector_t, but it returns page offset * into the bdev, not sector offset. 
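+ * + * (map_swap_entry() loses its static qualifier below, and its forward + * declaration above is removed, presumably so that code elsewhere in this + * patch set can resolve swap entries to block offsets; get_swap_info_struct() + * is added further down with the same intent.)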
*/ -static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) +sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) { struct swap_info_struct *sis; struct swap_extent *start_se; @@ -2721,8 +2775,14 @@ VM_BUG_ON_PAGE(!PageSwapCache(page), page); return swp_offset(swap); } + EXPORT_SYMBOL_GPL(__page_file_index); +struct swap_info_struct *get_swap_info_struct(unsigned type) +{ + return swap_info[type]; +} + /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's diff -Nur linux-4.2/mm/uksm.c linux-4.2-pck/mm/uksm.c --- linux-4.2/mm/uksm.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/mm/uksm.c 2015-09-08 00:51:03.470667662 -0300 @@ -0,0 +1,5525 @@ +/* + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia + * + * This is an improvement upon KSM. Some basic data structures and routines + * are borrowed from ksm.c. + * + * Its new features: + * 1. Full system scan: + * It automatically scans all user processes' anonymous VMAs. Kernel-user + * interaction to submit a memory area to KSM is no longer needed. + * + * 2. Rich area detection: + * It automatically detects rich areas containing abundant duplicated + * pages. Rich areas are given a full scan speed. Poor areas are + * sampled at a reasonable speed with very low CPU consumption. + * + * 3. Ultra Per-page scan speed improvement: + * A new hash algorithm is proposed. As a result, on a machine with a + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it + * can scan memory areas that do not contain duplicated pages at a speed of + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of + * 477MB/sec ~ 923MB/sec. + * + * 4. Thrashing area avoidance: + * Thrashing areas (VMAs with frequent KSM page break-outs) can be + * filtered out. My benchmark shows it's more efficient than KSM's per-page + * hash-value-based volatile page detection. + * + * + * 5. Misc changes upon KSM: + * * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page + * comparison. It's much faster than the default C version on x86. + * * rmap_item now has a struct page * member to loosely cache an + * address-->page mapping, which avoids many time-costly + * follow_page() calls. + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. + * * try_to_merge_two_pages() can now revert a pte if it fails, so no + * break_ksm is needed for this case. + * + * 6. Full Zero Page consideration (contributed by Figo Zhang): + * Now uksmd considers full zero pages as special pages and merges them + * into a special unswappable uksm zero page. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" + +#ifdef CONFIG_X86 +#undef memcmp + +#ifdef CONFIG_X86_32 +#define memcmp memcmpx86_32 +/* + * Compare the 4-byte-aligned addresses s1 and s2, with length n + */ +int memcmpx86_32(void *s1, void *s2, size_t n) +{ + size_t num = n / 4; + register int res; + + __asm__ __volatile__ + ( + "testl %3,%3\n\t" + "repe; cmpsd\n\t" + "je 1f\n\t" + "sbbl %0,%0\n\t" + "orl $1,%0\n" + "1:" + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) + : "0" (0) + : "cc"); + + return res; +} + +/* + * Check whether the page is all zeroes.
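+ * + * (repe; scasl compares successive 32-bit words against %eax == 0 and stops + * at the first non-zero word; sete then reports whether the final compare + * still saw zero, i.e. whether the whole buffer was zero. The plain C + * fallback further down does the same check word by word.)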
+ */ +static int is_full_zero(const void *s1, size_t len) +{ + unsigned char same; + + len /= 4; + + __asm__ __volatile__ + ("repe; scasl;" + "sete %0" + : "=qm" (same), "+D" (s1), "+c" (len) + : "a" (0) + : "cc"); + + return same; +} + + +#elif defined(CONFIG_X86_64) +#define memcmp memcmpx86_64 +/* + * Compare 8-byte-aligned address s1 and s2, with length n + */ +int memcmpx86_64(void *s1, void *s2, size_t n) +{ + size_t num = n / 8; + register int res; + + __asm__ __volatile__ + ( + "testq %q3,%q3\n\t" + "repe; cmpsq\n\t" + "je 1f\n\t" + "sbbq %q0,%q0\n\t" + "orq $1,%q0\n" + "1:" + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) + : "0" (0) + : "cc"); + + return res; +} + +static int is_full_zero(const void *s1, size_t len) +{ + unsigned char same; + + len /= 8; + + __asm__ __volatile__ + ("repe; scasq;" + "sete %0" + : "=qm" (same), "+D" (s1), "+c" (len) + : "a" (0) + : "cc"); + + return same; +} + +#endif +#else +static int is_full_zero(const void *s1, size_t len) +{ + unsigned long *src = s1; + int i; + + len /= sizeof(*src); + + for (i = 0; i < len; i++) { + if (src[i]) + return 0; + } + + return 1; +} +#endif + +#define UKSM_RUNG_ROUND_FINISHED (1 << 0) +#define TIME_RATIO_SCALE 10000 + +#define SLOT_TREE_NODE_SHIFT 8 +#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) +struct slot_tree_node { + unsigned long size; + struct sradix_tree_node snode; + void *stores[SLOT_TREE_NODE_STORE_SIZE]; +}; + +static struct kmem_cache *slot_tree_node_cachep; + +static struct sradix_tree_node *slot_tree_node_alloc(void) +{ + struct slot_tree_node *p; + p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL); + if (!p) + return NULL; + + return &p->snode; +} + +static void slot_tree_node_free(struct sradix_tree_node *node) +{ + struct slot_tree_node *p; + + p = container_of(node, struct slot_tree_node, snode); + kmem_cache_free(slot_tree_node_cachep, p); +} + +static void slot_tree_node_extend(struct sradix_tree_node *parent, + struct sradix_tree_node *child) +{ + struct slot_tree_node *p, *c; + + p = container_of(parent, struct slot_tree_node, snode); + c = container_of(child, struct slot_tree_node, snode); + + p->size += c->size; +} + +void slot_tree_node_assign(struct sradix_tree_node *node, + unsigned index, void *item) +{ + struct vma_slot *slot = item; + struct slot_tree_node *cur; + + slot->snode = node; + slot->sindex = index; + + while (node) { + cur = container_of(node, struct slot_tree_node, snode); + cur->size += slot->pages; + node = node->parent; + } +} + +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset) +{ + struct vma_slot *slot; + struct slot_tree_node *cur; + unsigned long pages; + + if (node->height == 1) { + slot = node->stores[offset]; + pages = slot->pages; + } else { + cur = container_of(node->stores[offset], + struct slot_tree_node, snode); + pages = cur->size; + } + + while (node) { + cur = container_of(node, struct slot_tree_node, snode); + cur->size -= pages; + node = node->parent; + } +} + +unsigned long slot_iter_index; +int slot_iter(void *item, unsigned long height) +{ + struct slot_tree_node *node; + struct vma_slot *slot; + + if (height == 1) { + slot = item; + if (slot_iter_index < slot->pages) { + /*in this one*/ + return 1; + } else { + slot_iter_index -= slot->pages; + return 0; + } + + } else { + node = container_of(item, struct slot_tree_node, snode); + if (slot_iter_index < node->size) { + /*in this one*/ + return 1; + } else { + slot_iter_index -= node->size; + return 0; + } + } +} + + +static inline void 
slot_tree_init_root(struct sradix_tree_root *root) +{ + init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); + root->alloc = slot_tree_node_alloc; + root->free = slot_tree_node_free; + root->extend = slot_tree_node_extend; + root->assign = slot_tree_node_assign; + root->rm = slot_tree_node_rm; +} + +void slot_tree_init(void) +{ + slot_tree_node_cachep = kmem_cache_create("slot_tree_node", + sizeof(struct slot_tree_node), 0, + SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, + NULL); +} + + +/* Each rung of this ladder is a list of VMAs having a same scan ratio */ +struct scan_rung { + //struct list_head scanned_list; + struct sradix_tree_root vma_root; + struct sradix_tree_root vma_root2; + + struct vma_slot *current_scan; + unsigned long current_offset; + + /* + * The initial value for current_offset, it should loop over + * [0~ step - 1] to let all slot have its chance to be scanned. + */ + unsigned long offset_init; + unsigned long step; /* dynamic step for current_offset */ + unsigned int flags; + unsigned long pages_to_scan; + //unsigned long fully_scanned_slots; + /* + * a little bit tricky - if cpu_time_ratio > 0, then the value is the + * the cpu time ratio it can spend in rung_i for every scan + * period. if < 0, then it is the cpu time ratio relative to the + * max cpu percentage user specified. Both in unit of + * 1/TIME_RATIO_SCALE + */ + int cpu_ratio; + + /* + * How long it will take for all slots in this rung to be fully + * scanned? If it's zero, we don't care about the cover time: + * it's fully scanned. + */ + unsigned int cover_msecs; + //unsigned long vma_num; + //unsigned long pages; /* Sum of all slot's pages in rung */ +}; + +/** + * node of either the stable or unstale rbtree + * + */ +struct tree_node { + struct rb_node node; /* link in the main (un)stable rbtree */ + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ + u32 hash; + unsigned long count; /* TODO: merged with sub_root */ + struct list_head all_list; /* all tree nodes in stable/unstable tree */ +}; + +/** + * struct stable_node - node of the stable rbtree + * @node: rb node of this ksm page in the stable tree + * @hlist: hlist head of rmap_items using this ksm page + * @kpfn: page frame number of this ksm page + */ +struct stable_node { + struct rb_node node; /* link in sub-rbtree */ + struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ + struct hlist_head hlist; + unsigned long kpfn; + u32 hash_max; /* if ==0 then it's not been calculated yet */ + struct list_head all_list; /* in a list for all stable nodes */ +}; + +/** + * struct node_vma - group rmap_items linked in a same stable + * node together. 
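+ * + * Each stable node keeps one node_vma per vma_slot that maps its page; the + * node_vmas are sorted on the stable node's hlist by slot address (used as + * the key), and each one collects that slot's rmap_items on rmap_hlist.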
+ */ +struct node_vma { + union { + struct vma_slot *slot; + unsigned long key; /* slot is used as key sorted on hlist */ + }; + struct hlist_node hlist; + struct hlist_head rmap_hlist; + struct stable_node *head; +}; + +/** + * struct rmap_item - reverse mapping item for virtual addresses + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree + * @mm: the memory structure this rmap_item is pointing into + * @address: the virtual address this rmap_item tracks (+ flags in low bits) + * @node: rb node of this rmap_item in the unstable tree + * @head: pointer to stable_node heading this list in the stable tree + * @hlist: link into hlist of rmap_items hanging off that stable_node + */ +struct rmap_item { + struct vma_slot *slot; + struct page *page; + unsigned long address; /* + low bits used for flags below */ + unsigned long hash_round; + unsigned long entry_index; + union { + struct {/* when in unstable tree */ + struct rb_node node; + struct tree_node *tree_node; + u32 hash_max; + }; + struct { /* when in stable tree */ + struct node_vma *head; + struct hlist_node hlist; + struct anon_vma *anon_vma; + }; + }; +} __attribute__((aligned(4))); + +struct rmap_list_entry { + union { + struct rmap_item *item; + unsigned long addr; + }; + /* lowest bit is used for is_addr tag */ +} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/ + + +/* Basic data structure definition ends */ + + +/* + * Flags for rmap_item to judge if it's listed in the stable/unstable tree. + * The flags use the low bits of rmap_item.address + */ +#define UNSTABLE_FLAG 0x1 +#define STABLE_FLAG 0x2 +#define get_rmap_addr(x) ((x)->address & PAGE_MASK) + +/* + * rmap_list_entry helpers + */ +#define IS_ADDR_FLAG 1 +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) + + +/* + * High speed caches for frequently allocated and freed structs + */ +static struct kmem_cache *rmap_item_cache; +static struct kmem_cache *stable_node_cache; +static struct kmem_cache *node_vma_cache; +static struct kmem_cache *vma_slot_cache; +static struct kmem_cache *tree_node_cache; +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ + sizeof(struct __struct), __alignof__(struct __struct),\ + (__flags), NULL) + +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ +#define SCAN_LADDER_SIZE 4 +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; + +/* The evaluation rounds uksmd has finished */ +static unsigned long long uksm_eval_round = 1; + +/* + * we add 1 to this var when we consider we should rebuild the whole + * unstable tree. + */ +static unsigned long uksm_hash_round = 1; + +/* + * How many times the whole memory is scanned. 
+ */ +static unsigned long long fully_scanned_round = 1; + +/* The total number of virtual pages of all vma slots */ +static u64 uksm_pages_total; + +/* The number of pages that have been scanned since startup */ +static u64 uksm_pages_scanned; + +static u64 scanned_virtual_pages; + +/* The number of pages that have been scanned since the last encode_benefit call */ +static u64 uksm_pages_scanned_last; + +/* If the scanned number grows too large, we encode it here */ +static u64 pages_scanned_stored; + +static unsigned long pages_scanned_base; + +/* The number of nodes in the stable tree */ +static unsigned long uksm_pages_shared; + +/* The number of page slots additionally sharing those nodes */ +static unsigned long uksm_pages_sharing; + +/* The number of nodes in the unstable tree */ +static unsigned long uksm_pages_unshared; + +/* + * Milliseconds ksmd should sleep between scans, + * >= 100ms to be consistent with + * scan_time_to_sleep_msec() + */ +static unsigned int uksm_sleep_jiffies; + +/* The real value for the uksmd next sleep */ +static unsigned int uksm_sleep_real; + +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ +static unsigned int uksm_sleep_saved; + +/* Max percentage of cpu utilization ksmd can take to scan in one batch */ +static unsigned int uksm_max_cpu_percentage; + +static int uksm_cpu_governor; + +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; + +struct uksm_cpu_preset_s { + int cpu_ratio[SCAN_LADDER_SIZE]; + unsigned int cover_msecs[SCAN_LADDER_SIZE]; + unsigned int max_cpu; /* percentage */ +}; + +struct uksm_cpu_preset_s uksm_cpu_preset[4] = { + { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, + { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, + { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, + { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, +}; + +/* The default value for uksm_ema_page_time if it's not initialized */ +#define UKSM_PAGE_TIME_DEFAULT 500 + +/* Cost to scan one page, as an exponential moving average, in nsecs */ +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; + +/* The exponential moving average alpha weight, in percentage. */ +#define EMA_ALPHA 20 + +/* + * The threshold used to filter out thrashing areas. + * If it == 0, filtering is disabled; otherwise it's the percentage upper bound + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio + * will be considered as having a zero duplication ratio. + */ +static unsigned int uksm_thrash_threshold = 50; + +/* How much dedup ratio is considered to be abundant */ +static unsigned int uksm_abundant_threshold = 10; + +/* All slots having merged pages in this eval round. */ +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); + +/* How many times the ksmd has slept since startup */ +static unsigned long long uksm_sleep_times; + +#define UKSM_RUN_STOP 0 +#define UKSM_RUN_MERGE 1 +static unsigned int uksm_run = 1; + +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); +static DEFINE_MUTEX(uksm_thread_mutex); + +/* + * List vma_slot_new is for newly created vma_slots waiting to be added by + * ksmd. If one cannot be added (e.g. because it is too small), it's moved to + * vma_slot_noadd. vma_slot_del is the list for vma_slots whose corresponding + * VMA has been removed/freed.
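+ * + * Lifecycle sketch: uksm_vma_add_new() queues a fresh slot on vma_slot_new; + * uksmd later moves it into its scan ladder (or onto vma_slot_noadd); when + * the VMA goes away, uksm_remove_vma() frees a slot that is still sitting on + * the new list directly, and otherwise parks it on vma_slot_del for uksmd to + * reclaim.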
+ */ +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); +static DEFINE_SPINLOCK(vma_slot_list_lock); + +/* The unstable tree heads */ +static struct rb_root root_unstable_tree = RB_ROOT; + +/* + * All tree_nodes are in a list to be freed at once when unstable tree is + * freed after each scan round. + */ +static struct list_head unstable_tree_node_list = + LIST_HEAD_INIT(unstable_tree_node_list); + +/* List contains all stable nodes */ +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); + +/* + * When the hash strength is changed, the stable tree must be delta_hashed and + * re-structured. We use two set of below structs to speed up the + * re-structuring of stable tree. + */ +static struct list_head +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), + LIST_HEAD_INIT(stable_tree_node_list[1])}; + +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; +static struct rb_root *root_stable_treep = &root_stable_tree[0]; +static unsigned long stable_tree_index; + +/* The hash strength needed to hash a full page */ +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) + +/* The hash strength needed for loop-back hashing */ +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) + +/* The random offsets in a page */ +static u32 *random_nums; + +/* The hash strength */ +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; + +/* The delta value each time the hash strength increases or decreases */ +static unsigned long hash_strength_delta; +#define HASH_STRENGTH_DELTA_MAX 5 + +/* The time we have saved due to random_sample_hash */ +static u64 rshash_pos; + +/* The time we have wasted due to hash collision */ +static u64 rshash_neg; + +struct uksm_benefit { + u64 pos; + u64 neg; + u64 scanned; + unsigned long base; +} benefit; + +/* + * The relative cost of memcmp, compared to 1 time unit of random sample + * hash, this value is tested when ksm module is initialized + */ +static unsigned long memcmp_cost; + +static unsigned long rshash_neg_cont_zero; +static unsigned long rshash_cont_obscure; + +/* The possible states of hash strength adjustment heuristic */ +enum rshash_states { + RSHASH_STILL, + RSHASH_TRYUP, + RSHASH_TRYDOWN, + RSHASH_NEW, + RSHASH_PRE_STILL, +}; + +/* The possible direction we are about to adjust hash strength */ +enum rshash_direct { + GO_UP, + GO_DOWN, + OBSCURE, + STILL, +}; + +/* random sampling hash state machine */ +static struct { + enum rshash_states state; + enum rshash_direct pre_direct; + u8 below_count; + /* Keep a lookup window of size 5, iff above_count/below_count > 3 + * in this window we stop trying. 
+ */ + u8 lookup_window_index; + u64 stable_benefit; + unsigned long turn_point_down; + unsigned long turn_benefit_down; + unsigned long turn_point_up; + unsigned long turn_benefit_up; + unsigned long stable_point; +} rshash_state; + +/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ +static u32 *zero_hash_table; + +static inline struct node_vma *alloc_node_vma(void) +{ + struct node_vma *node_vma; + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL); + if (node_vma) { + INIT_HLIST_HEAD(&node_vma->rmap_hlist); + INIT_HLIST_NODE(&node_vma->hlist); + } + return node_vma; +} + +static inline void free_node_vma(struct node_vma *node_vma) +{ + kmem_cache_free(node_vma_cache, node_vma); +} + + +static inline struct vma_slot *alloc_vma_slot(void) +{ + struct vma_slot *slot; + + /* + * In case ksm is not initialized by now. + * Oops, we need to consider the call site of uksm_init() in the future. + */ + if (!vma_slot_cache) + return NULL; + + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL); + if (slot) { + INIT_LIST_HEAD(&slot->slot_list); + INIT_LIST_HEAD(&slot->dedup_list); + slot->flags |= UKSM_SLOT_NEED_RERAND; + } + return slot; +} + +static inline void free_vma_slot(struct vma_slot *vma_slot) +{ + kmem_cache_free(vma_slot_cache, vma_slot); +} + + + +static inline struct rmap_item *alloc_rmap_item(void) +{ + struct rmap_item *rmap_item; + + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + if (rmap_item) { + /* bug on lowest bit is not clear for flag use */ + BUG_ON(is_addr(rmap_item)); + } + return rmap_item; +} + +static inline void free_rmap_item(struct rmap_item *rmap_item) +{ + rmap_item->slot = NULL; /* debug safety */ + kmem_cache_free(rmap_item_cache, rmap_item); +} + +static inline struct stable_node *alloc_stable_node(void) +{ + struct stable_node *node; + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC); + if (!node) + return NULL; + + INIT_HLIST_HEAD(&node->hlist); + list_add(&node->all_list, &stable_node_list); + return node; +} + +static inline void free_stable_node(struct stable_node *stable_node) +{ + list_del(&stable_node->all_list); + kmem_cache_free(stable_node_cache, stable_node); +} + +static inline struct tree_node *alloc_tree_node(struct list_head *list) +{ + struct tree_node *node; + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC); + if (!node) + return NULL; + + list_add(&node->all_list, list); + return node; +} + +static inline void free_tree_node(struct tree_node *node) +{ + list_del(&node->all_list); + kmem_cache_free(tree_node_cache, node); +} + +static void uksm_drop_anon_vma(struct rmap_item *rmap_item) +{ + struct anon_vma *anon_vma = rmap_item->anon_vma; + + put_anon_vma(anon_vma); +} + + +/** + * Remove a stable node from stable_tree, may unlink from its tree_node and + * may remove its parent tree_node if no other stable node is pending. + * + * @stable_node The node need to be removed + * @unlink_rb Will this node be unlinked from the rbtree? + * @remove_tree_ node Will its tree_node be removed if empty? 
+ */ +static void remove_node_from_stable_tree(struct stable_node *stable_node, + int unlink_rb, int remove_tree_node) +{ + struct node_vma *node_vma; + struct rmap_item *rmap_item; + struct hlist_node *n; + + if (!hlist_empty(&stable_node->hlist)) { + hlist_for_each_entry_safe(node_vma, n, + &stable_node->hlist, hlist) { + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { + uksm_pages_sharing--; + + uksm_drop_anon_vma(rmap_item); + rmap_item->address &= PAGE_MASK; + } + free_node_vma(node_vma); + cond_resched(); + } + + /* the last one is counted as shared */ + uksm_pages_shared--; + uksm_pages_sharing++; + } + + if (stable_node->tree_node && unlink_rb) { + rb_erase(&stable_node->node, + &stable_node->tree_node->sub_root); + + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && + remove_tree_node) { + rb_erase(&stable_node->tree_node->node, + root_stable_treep); + free_tree_node(stable_node->tree_node); + } else { + stable_node->tree_node->count--; + } + } + + free_stable_node(stable_node); +} + + +/* + * get_uksm_page: checks if the page indicated by the stable node + * is still its ksm page, despite having held no reference to it. + * In which case we can trust the content of the page, and it + * returns the gotten page; but if the page has now been zapped, + * remove the stale node from the stable tree and return NULL. + * + * You would expect the stable_node to hold a reference to the ksm page. + * But if it increments the page's count, swapping out has to wait for + * ksmd to come around again before it can free the page, which may take + * seconds or even minutes: much too unresponsive. So instead we use a + * "keyhole reference": access to the ksm page from the stable node peeps + * out through its keyhole to see if that page still holds the right key, + * pointing back to this stable node. This relies on freeing a PageAnon + * page to reset its page->mapping to NULL, and relies on no other use of + * a page to put something that might look like our key in page->mapping. + * + * include/linux/pagemap.h page_cache_get_speculative() is a good reference, + * but this is different - made simpler by uksm_thread_mutex being held, but + * interesting for assuming that no other use of the struct page could ever + * put our expected_mapping into page->mapping (or a field of the union which + * coincides with page->mapping). The RCU calls are not for KSM at all, but + * to keep the page_count protocol described with page_cache_get_speculative. + * + * Note: it is possible that get_uksm_page() will return NULL one moment, + * then page the next, if the page is in between page_freeze_refs() and + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page + * is on its way to being freed; but it is an anomaly to bear in mind. + * + * @unlink_rb: if the removal of this node will firstly unlink from + * its rbtree. stable_node_reinsert will prevent this when restructuring the + * node from its old tree. + * + * @remove_tree_node: if this is the last one of its tree_node, will the + * tree_node be freed ? If we are inserting stable node, this tree_node may + * be reused, so don't free it. 
+ */ +static struct page *get_uksm_page(struct stable_node *stable_node, + int unlink_rb, int remove_tree_node) +{ + struct page *page; + void *expected_mapping; + + page = pfn_to_page(stable_node->kpfn); + expected_mapping = (void *)stable_node + + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + rcu_read_lock(); + if (page->mapping != expected_mapping) + goto stale; + if (!get_page_unless_zero(page)) + goto stale; + if (page->mapping != expected_mapping) { + put_page(page); + goto stale; + } + rcu_read_unlock(); + return page; +stale: + rcu_read_unlock(); + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); + + return NULL; +} + +/* + * Removing rmap_item from stable or unstable tree. + * This function will clean the information from the stable/unstable tree. + */ +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) +{ + if (rmap_item->address & STABLE_FLAG) { + struct stable_node *stable_node; + struct node_vma *node_vma; + struct page *page; + + node_vma = rmap_item->head; + stable_node = node_vma->head; + page = get_uksm_page(stable_node, 1, 1); + if (!page) + goto out; + + /* + * page lock is needed because it's racing with + * try_to_unmap_ksm(), etc. + */ + lock_page(page); + hlist_del(&rmap_item->hlist); + + if (hlist_empty(&node_vma->rmap_hlist)) { + hlist_del(&node_vma->hlist); + free_node_vma(node_vma); + } + unlock_page(page); + + put_page(page); + if (hlist_empty(&stable_node->hlist)) { + /* do NOT call remove_node_from_stable_tree() here, + * it's possible for a forked rmap_item not in + * stable tree while the in-tree rmap_items were + * deleted. + */ + uksm_pages_shared--; + } else + uksm_pages_sharing--; + + + uksm_drop_anon_vma(rmap_item); + } else if (rmap_item->address & UNSTABLE_FLAG) { + if (rmap_item->hash_round == uksm_hash_round) { + + rb_erase(&rmap_item->node, + &rmap_item->tree_node->sub_root); + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { + rb_erase(&rmap_item->tree_node->node, + &root_unstable_tree); + + free_tree_node(rmap_item->tree_node); + } else + rmap_item->tree_node->count--; + } + uksm_pages_unshared--; + } + + rmap_item->address &= PAGE_MASK; + rmap_item->hash_max = 0; + +out: + cond_resched(); /* we're called from many long loops */ +} + +static inline int slot_in_uksm(struct vma_slot *slot) +{ + return list_empty(&slot->slot_list); +} + +/* + * Test if the mm is exiting + */ +static inline bool uksm_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +static inline unsigned long vma_pool_size(struct vma_slot *slot) +{ + return round_up(sizeof(struct rmap_list_entry) * slot->pages, + PAGE_SIZE) >> PAGE_SHIFT; +} + +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) + +/* must be done with sem locked */ +static int slot_pool_alloc(struct vma_slot *slot) +{ + unsigned long pool_size; + + if (slot->rmap_list_pool) + return 0; + + pool_size = vma_pool_size(slot); + slot->rmap_list_pool = kzalloc(sizeof(struct page *) * + pool_size, GFP_KERNEL); + if (!slot->rmap_list_pool) + return -ENOMEM; + + slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size, + GFP_KERNEL); + if (!slot->pool_counts) { + kfree(slot->rmap_list_pool); + return -ENOMEM; + } + + slot->pool_size = pool_size; + BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); + slot->flags |= UKSM_SLOT_IN_UKSM; + uksm_pages_total += slot->pages; + + return 0; +} + +/* + * Called after vma is unlinked from its mm + */ +void uksm_remove_vma(struct vm_area_struct *vma) +{ + struct vma_slot *slot; + + if 
(!vma->uksm_vma_slot) + return; + + spin_lock(&vma_slot_list_lock); + slot = vma->uksm_vma_slot; + if (!slot) + goto out; + + if (slot_in_uksm(slot)) { + /** + * This slot has been added by ksmd, so move to the del list + * waiting ksmd to free it. + */ + list_add_tail(&slot->slot_list, &vma_slot_del); + } else { + /** + * It's still on new list. It's ok to free slot directly. + */ + list_del(&slot->slot_list); + free_vma_slot(slot); + } +out: + vma->uksm_vma_slot = NULL; + spin_unlock(&vma_slot_list_lock); +} + +/** + * Need to do two things: + * 1. check if slot was moved to del list + * 2. make sure the mmap_sem is manipulated under valid vma. + * + * My concern here is that in some cases, this may make + * vma_slot_list_lock() waiters to serialized further by some + * sem->wait_lock, can this really be expensive? + * + * + * @return + * 0: if successfully locked mmap_sem + * -ENOENT: this slot was moved to del list + * -EBUSY: vma lock failed + */ +static int try_down_read_slot_mmap_sem(struct vma_slot *slot) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + struct rw_semaphore *sem; + + spin_lock(&vma_slot_list_lock); + + /* the slot_list was removed and inited from new list, when it enters + * uksm_list. If now it's not empty, then it must be moved to del list + */ + if (!slot_in_uksm(slot)) { + spin_unlock(&vma_slot_list_lock); + return -ENOENT; + } + + BUG_ON(slot->pages != vma_pages(slot->vma)); + /* Ok, vma still valid */ + vma = slot->vma; + mm = vma->vm_mm; + sem = &mm->mmap_sem; + + if (uksm_test_exit(mm)) { + spin_unlock(&vma_slot_list_lock); + return -ENOENT; + } + + if (down_read_trylock(sem)) { + spin_unlock(&vma_slot_list_lock); + if (slot_pool_alloc(slot)) { + uksm_remove_vma(vma); + up_read(sem); + return -ENOENT; + } + return 0; + } + + spin_unlock(&vma_slot_list_lock); + return -EBUSY; +} + +static inline unsigned long +vma_page_address(struct page *page, struct vm_area_struct *vma) +{ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { + /* page should be within @vma mapping range */ + return -EFAULT; + } + return address; +} + + +/* return 0 on success with the item's mmap_sem locked */ +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) +{ + struct mm_struct *mm; + struct vma_slot *slot = item->slot; + int err = -EINVAL; + + struct page *page; + + /* + * try_down_read_slot_mmap_sem() returns non-zero if the slot + * has been removed by uksm_remove_vma(). + */ + if (try_down_read_slot_mmap_sem(slot)) + return -EBUSY; + + mm = slot->vma->vm_mm; + + if (uksm_test_exit(mm)) + goto failout_up; + + page = item->page; + rcu_read_lock(); + if (!get_page_unless_zero(page)) { + rcu_read_unlock(); + goto failout_up; + } + + /* No need to consider huge page here. */ + if (item->slot->vma->anon_vma != page_anon_vma(page) || + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { + /* + * TODO: + * should we release this item becase of its stale page + * mapping? + */ + put_page(page); + rcu_read_unlock(); + goto failout_up; + } + rcu_read_unlock(); + return 0; + +failout_up: + up_read(&mm->mmap_sem); + return err; +} + +/* + * What kind of VMA is considered ? + */ +static inline int vma_can_enter(struct vm_area_struct *vma) +{ + return uksm_flags_can_scan(vma->vm_flags); +} + +/* + * Called whenever a fresh new vma is created A new vma_slot. 
+ * is created and inserted into a global list Must be called. + * after vma is inserted to its mm . + */ +void uksm_vma_add_new(struct vm_area_struct *vma) +{ + struct vma_slot *slot; + + if (!vma_can_enter(vma)) { + vma->uksm_vma_slot = NULL; + return; + } + + slot = alloc_vma_slot(); + if (!slot) { + vma->uksm_vma_slot = NULL; + return; + } + + vma->uksm_vma_slot = slot; + vma->vm_flags |= VM_MERGEABLE; + slot->vma = vma; + slot->mm = vma->vm_mm; + slot->ctime_j = jiffies; + slot->pages = vma_pages(vma); + spin_lock(&vma_slot_list_lock); + list_add_tail(&slot->slot_list, &vma_slot_new); + spin_unlock(&vma_slot_list_lock); +} + +/* 32/3 < they < 32/2 */ +#define shiftl 8 +#define shiftr 12 + +#define HASH_FROM_TO(from, to) \ +for (index = from; index < to; index++) { \ + pos = random_nums[index]; \ + hash += key[pos]; \ + hash += (hash << shiftl); \ + hash ^= (hash >> shiftr); \ +} + + +#define HASH_FROM_DOWN_TO(from, to) \ +for (index = from - 1; index >= to; index--) { \ + hash ^= (hash >> shiftr); \ + hash ^= (hash >> (shiftr*2)); \ + hash -= (hash << shiftl); \ + hash += (hash << (shiftl*2)); \ + pos = random_nums[index]; \ + hash -= key[pos]; \ +} + +/* + * The main random sample hash function. + */ +static u32 random_sample_hash(void *addr, u32 hash_strength) +{ + u32 hash = 0xdeadbeef; + int index, pos, loop = hash_strength; + u32 *key = (u32 *)addr; + + if (loop > HASH_STRENGTH_FULL) + loop = HASH_STRENGTH_FULL; + + HASH_FROM_TO(0, loop); + + if (hash_strength > HASH_STRENGTH_FULL) { + loop = hash_strength - HASH_STRENGTH_FULL; + HASH_FROM_TO(0, loop); + } + + return hash; +} + + +/** + * It's used when hash strength is adjusted + * + * @addr The page's virtual address + * @from The original hash strength + * @to The hash strength changed to + * @hash The hash value generated with "from" hash value + * + * return the hash value + */ +static u32 delta_hash(void *addr, int from, int to, u32 hash) +{ + u32 *key = (u32 *)addr; + int index, pos; /* make sure they are int type */ + + if (to > from) { + if (from >= HASH_STRENGTH_FULL) { + from -= HASH_STRENGTH_FULL; + to -= HASH_STRENGTH_FULL; + HASH_FROM_TO(from, to); + } else if (to <= HASH_STRENGTH_FULL) { + HASH_FROM_TO(from, to); + } else { + HASH_FROM_TO(from, HASH_STRENGTH_FULL); + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); + } + } else { + if (from <= HASH_STRENGTH_FULL) { + HASH_FROM_DOWN_TO(from, to); + } else if (to >= HASH_STRENGTH_FULL) { + from -= HASH_STRENGTH_FULL; + to -= HASH_STRENGTH_FULL; + HASH_FROM_DOWN_TO(from, to); + } else { + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); + } + } + + return hash; +} + +/** + * + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round + * has finished. + * + * return 0 if no page has been scanned since last call, 1 otherwise. 
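+ * + * The deltas are accumulated right-shifted by benefit.base, and base is + * bumped (halving the stored sums) whenever an addition would overflow a + * u64; e.g. with benefit.base == 3, a delta of 800 scanned pages is + * recorded as 800 >> 3 == 100.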
+ */ +static inline int encode_benefit(void) +{ + u64 scanned_delta, pos_delta, neg_delta; + unsigned long base = benefit.base; + + scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; + + if (!scanned_delta) + return 0; + + scanned_delta >>= base; + pos_delta = rshash_pos >> base; + neg_delta = rshash_neg >> base; + + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || + CAN_OVERFLOW_U64(benefit.neg, neg_delta) || + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { + benefit.scanned >>= 1; + benefit.neg >>= 1; + benefit.pos >>= 1; + benefit.base++; + scanned_delta >>= 1; + pos_delta >>= 1; + neg_delta >>= 1; + } + + benefit.pos += pos_delta; + benefit.neg += neg_delta; + benefit.scanned += scanned_delta; + + BUG_ON(!benefit.scanned); + + rshash_pos = rshash_neg = 0; + uksm_pages_scanned_last = uksm_pages_scanned; + + return 1; +} + +static inline void reset_benefit(void) +{ + benefit.pos = 0; + benefit.neg = 0; + benefit.base = 0; + benefit.scanned = 0; +} + +static inline void inc_rshash_pos(unsigned long delta) +{ + if (CAN_OVERFLOW_U64(rshash_pos, delta)) + encode_benefit(); + + rshash_pos += delta; +} + +static inline void inc_rshash_neg(unsigned long delta) +{ + if (CAN_OVERFLOW_U64(rshash_neg, delta)) + encode_benefit(); + + rshash_neg += delta; +} + + +static inline u32 page_hash(struct page *page, unsigned long hash_strength, + int cost_accounting) +{ + u32 val; + unsigned long delta; + + void *addr = kmap_atomic(page); + + val = random_sample_hash(addr, hash_strength); + kunmap_atomic(addr); + + if (cost_accounting) { + if (HASH_STRENGTH_FULL > hash_strength) + delta = HASH_STRENGTH_FULL - hash_strength; + else + delta = 0; + + inc_rshash_pos(delta); + } + + return val; +} + +static int memcmp_pages(struct page *page1, struct page *page2, + int cost_accounting) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_atomic(page1); + addr2 = kmap_atomic(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_atomic(addr2); + kunmap_atomic(addr1); + + if (cost_accounting) + inc_rshash_neg(memcmp_cost); + + return ret; +} + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2, 0); +} + +static inline int is_page_full_zero(struct page *page) +{ + char *addr; + int ret; + + addr = kmap_atomic(page); + ret = is_full_zero(addr, PAGE_SIZE); + kunmap_atomic(addr); + + return ret; +} + +static int write_protect_page(struct vm_area_struct *vma, struct page *page, + pte_t *orig_pte, pte_t *old_pte) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr; + pte_t *ptep; + spinlock_t *ptl; + int swapped; + int err = -EFAULT; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + BUG_ON(PageTransCompound(page)); + + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + ptep = page_check_address(page, mm, addr, &ptl, 0); + if (!ptep) + goto out_mn; + + if (old_pte) + *old_pte = *ptep; + + if (pte_write(*ptep) || pte_dirty(*ptep)) { + pte_t entry; + + swapped = PageSwapCache(page); + flush_cache_page(vma, addr, page_to_pfn(page)); + /* + * Ok this is tricky, when get_user_pages_fast() run it doesnt + * take any lock, therefore the check that we are going to make + * with the pagecount against the mapcount is racey and + * O_DIRECT can happen right after the check. 
+ * So we clear the pte and flush the tlb before the check + * this assure us that no O_DIRECT can happen after the check + * or in the middle of the check. + */ + entry = ptep_clear_flush_notify(vma, addr, ptep); + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if (page_mapcount(page) + 1 + swapped != page_count(page)) { + set_pte_at(mm, addr, ptep, entry); + goto out_unlock; + } + if (pte_dirty(entry)) + set_page_dirty(page); + entry = pte_mkclean(pte_wrprotect(entry)); + set_pte_at_notify(mm, addr, ptep, entry); + } + *orig_pte = *ptep; + err = 0; + +out_unlock: + pte_unmap_unlock(ptep, ptl); +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +out: + return err; +} + +#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ +#define MERGE_ERR_COLLI 2 /* there is a collision */ +#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ +#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ + + +/** + * replace_page - replace page in vma by new ksm page + * @vma: vma that holds the pte pointing to page + * @page: the page we are replacing by kpage + * @kpage: the ksm page we replace page by + * @orig_pte: the original value of the pte + * + * Returns 0 on success, MERGE_ERR_PGERR on failure. + */ +static int replace_page(struct vm_area_struct *vma, struct page *page, + struct page *kpage, pte_t orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + pte_t entry; + + unsigned long addr; + int err = MERGE_ERR_PGERR; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + BUG_ON(pmd_trans_huge(*pmd)); + if (!pmd_present(*pmd)) + goto out; + + mmun_start = addr; + mmun_end = addr + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_same(*ptep, orig_pte)) { + pte_unmap_unlock(ptep, ptl); + goto out_mn; + } + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + entry = mk_pte(kpage, vma->vm_page_prot); + + /* special treatment is needed for zero_page */ + if ((page_to_pfn(kpage) == uksm_zero_pfn) || + (page_to_pfn(kpage) == zero_pfn)) + entry = pte_mkspecial(entry); + else { + get_page(kpage); + page_add_anon_rmap(kpage, vma, addr); + } + + set_pte_at_notify(mm, addr, ptep, entry); + + page_remove_rmap(page); + if (!page_mapped(page)) + try_to_free_swap(page); + put_page(page); + + pte_unmap_unlock(ptep, ptl); + err = 0; +out_mn: + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +out: + return err; +} + + +/** + * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The + * zero hash value at HASH_STRENGTH_MAX is used to indicated that its + * hash_max member has not been calculated. 
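+ * (If the full-strength hash genuinely computes to 0, it is remapped to 1 + * below, so 0 can keep meaning "not yet calculated".)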
+ * + * @page The page needs to be hashed + * @hash_old The hash value calculated with current hash strength + * + * return the new hash value calculated at HASH_STRENGTH_MAX + */ +static inline u32 page_hash_max(struct page *page, u32 hash_old) +{ + u32 hash_max = 0; + void *addr; + + addr = kmap_atomic(page); + hash_max = delta_hash(addr, hash_strength, + HASH_STRENGTH_MAX, hash_old); + + kunmap_atomic(addr); + + if (!hash_max) + hash_max = 1; + + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); + return hash_max; +} + +/* + * We compare the hash again, to ensure that it is really a hash collision + * instead of being caused by page write. + */ +static inline int check_collision(struct rmap_item *rmap_item, + u32 hash) +{ + int err; + struct page *page = rmap_item->page; + + /* if this rmap_item has already been hash_maxed, then the collision + * must appears in the second-level rbtree search. In this case we check + * if its hash_max value has been changed. Otherwise, the collision + * happens in the first-level rbtree search, so we check against it's + * current hash value. + */ + if (rmap_item->hash_max) { + inc_rshash_neg(memcmp_cost); + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); + + if (rmap_item->hash_max == page_hash_max(page, hash)) + err = MERGE_ERR_COLLI; + else + err = MERGE_ERR_CHANGED; + } else { + inc_rshash_neg(memcmp_cost + hash_strength); + + if (page_hash(page, hash_strength, 0) == hash) + err = MERGE_ERR_COLLI; + else + err = MERGE_ERR_CHANGED; + } + + return err; +} + +static struct page *page_trans_compound_anon(struct page *page) +{ + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + /* + * head may actually be splitted and freed from under + * us but it's ok here. + */ + if (PageAnon(head)) + return head; + } + return NULL; +} + +static int page_trans_compound_anon_split(struct page *page) +{ + int ret = 0; + struct page *transhuge_head = page_trans_compound_anon(page); + if (transhuge_head) { + /* Get the reference on the head to split it. */ + if (get_page_unless_zero(transhuge_head)) { + /* + * Recheck we got the reference while the head + * was still anonymous. + */ + if (PageAnon(transhuge_head)) + ret = split_huge_page(transhuge_head); + else + /* + * Retry later if split_huge_page run + * from under us. + */ + ret = 1; + put_page(transhuge_head); + } else + /* Retry later if split_huge_page run from under us. */ + ret = 1; + } + return ret; +} + +/** + * Try to merge a rmap_item.page with a kpage in stable node. kpage must + * already be a ksm page. + * + * @return 0 if the pages were merged, -EFAULT otherwise. + */ +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, + struct page *kpage, u32 hash) +{ + struct vm_area_struct *vma = rmap_item->slot->vma; + struct mm_struct *mm = vma->vm_mm; + pte_t orig_pte = __pte(0); + int err = MERGE_ERR_PGERR; + struct page *page; + + if (uksm_test_exit(mm)) + goto out; + + page = rmap_item->page; + + if (page == kpage) { /* ksm page forked */ + err = 0; + goto out; + } + + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); + + if (!PageAnon(page) || !PageKsm(kpage)) + goto out; + + /* + * We need the page lock to read a stable PageSwapCache in + * write_protect_page(). We use trylock_page() instead of + * lock_page() because we don't want to wait here - we + * prefer to continue scanning and merging different pages, + * then come back to this page when it is unlocked. 
+ */ + if (!trylock_page(page)) + goto out; + /* + * If this anonymous page is mapped only here, its pte may need + * to be write-protected. If it's mapped elsewhere, all of its + * ptes are necessarily already write-protected. But in either + * case, we need to lock and check page_count is not raised. + */ + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { + if (pages_identical(page, kpage)) + err = replace_page(vma, page, kpage, orig_pte); + else + err = check_collision(rmap_item, hash); + } + + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { + munlock_vma_page(page); + if (!PageMlocked(kpage)) { + unlock_page(page); + lock_page(kpage); + mlock_vma_page(kpage); + page = kpage; /* for final unlock */ + } + } + + unlock_page(page); +out: + return err; +} + + + +/** + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance + * to restore a page mapping that has been changed in try_to_merge_two_pages. + * + * @return 0 on success. + */ +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t orig_pte, pte_t wprt_pte) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + + int err = -EFAULT; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_same(*ptep, wprt_pte)) { + /* already copied, let it be */ + pte_unmap_unlock(ptep, ptl); + goto out; + } + + /* + * Good boy, still here. When we still get the ksm page, it does not + * return to the free page pool, there is no way that a pte was changed + * to other page and gets back to this page. And remind that ksm page + * do not reuse in do_wp_page(). So it's safe to restore the original + * pte. + */ + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, orig_pte); + + pte_unmap_unlock(ptep, ptl); + err = 0; +out: + return err; +} + +/** + * try_to_merge_two_pages() - take two identical pages and prepare + * them to be merged into one page(rmap_item->page) + * + * @return 0 if we successfully merged two identical pages into + * one ksm page. MERGE_ERR_COLLI if it's only a hash collision + * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been + * changed since it's hashed. MERGE_ERR_PGERR otherwise. 
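+ * MERGE_ERR_COLLI_MAX is returned when the two pages already share the
+ * same HASH_STRENGTH_MAX hash value yet their contents still differ.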
+ * + */ +static int try_to_merge_two_pages(struct rmap_item *rmap_item, + struct rmap_item *tree_rmap_item, + u32 hash) +{ + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); + struct vm_area_struct *vma1 = rmap_item->slot->vma; + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; + struct page *page = rmap_item->page; + struct page *tree_page = tree_rmap_item->page; + int err = MERGE_ERR_PGERR; + struct address_space *saved_mapping; + + + if (rmap_item->page == tree_rmap_item->page) + goto out; + + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); + + if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page)) + goto out; + BUG_ON(PageTransCompound(tree_page)); + + if (!PageAnon(page) || !PageAnon(tree_page)) + goto out; + + if (!trylock_page(page)) + goto out; + + + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { + unlock_page(page); + goto out; + } + + /* + * While we hold page lock, upgrade page from + * PageAnon+anon_vma to PageKsm+NULL stable_node: + * stable_tree_insert() will update stable_node. + */ + saved_mapping = page->mapping; + set_page_stable_node(page, NULL); + mark_page_accessed(page); + unlock_page(page); + + if (!trylock_page(tree_page)) + goto restore_out; + + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { + unlock_page(tree_page); + goto restore_out; + } + + if (pages_identical(page, tree_page)) { + err = replace_page(vma2, tree_page, page, wprt_pte2); + if (err) { + unlock_page(tree_page); + goto restore_out; + } + + if ((vma2->vm_flags & VM_LOCKED)) { + munlock_vma_page(tree_page); + if (!PageMlocked(page)) { + unlock_page(tree_page); + lock_page(page); + mlock_vma_page(page); + tree_page = page; /* for final unlock */ + } + } + + unlock_page(tree_page); + + goto out; /* success */ + + } else { + if (tree_rmap_item->hash_max && + tree_rmap_item->hash_max == rmap_item->hash_max) { + err = MERGE_ERR_COLLI_MAX; + } else if (page_hash(page, hash_strength, 0) == + page_hash(tree_page, hash_strength, 0)) { + inc_rshash_neg(memcmp_cost + hash_strength * 2); + err = MERGE_ERR_COLLI; + } else { + err = MERGE_ERR_CHANGED; + } + + unlock_page(tree_page); + } + +restore_out: + lock_page(page); + if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), + orig_pte1, wprt_pte1)) + page->mapping = saved_mapping; + + unlock_page(page); +out: + return err; +} + +static inline int hash_cmp(u32 new_val, u32 node_val) +{ + if (new_val > node_val) + return 1; + else if (new_val < node_val) + return -1; + else + return 0; +} + +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) +{ + u32 hash_max = item->hash_max; + + if (!hash_max) { + hash_max = page_hash_max(item->page, hash); + + item->hash_max = hash_max; + } + + return hash_max; +} + + + +/** + * stable_tree_search() - search the stable tree for a page + * + * @item: the rmap_item we are comparing with + * @hash: the hash value of this item->page already calculated + * + * @return the page we have found, NULL otherwise. The page returned has + * been gotten. 
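+ * That is, a reference has already been taken on it (via get_page() or
+ * get_uksm_page()), so the caller must drop it with put_page().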
+ */ +static struct page *stable_tree_search(struct rmap_item *item, u32 hash) +{ + struct rb_node *node = root_stable_treep->rb_node; + struct tree_node *tree_node; + unsigned long hash_max; + struct page *page = item->page; + struct stable_node *stable_node; + + stable_node = page_stable_node(page); + if (stable_node) { + /* ksm page forked, that is + * if (PageKsm(page) && !in_stable_tree(rmap_item)) + * it's actually gotten once outside. + */ + get_page(page); + return page; + } + + while (node) { + int cmp; + + tree_node = rb_entry(node, struct tree_node, node); + + cmp = hash_cmp(hash, tree_node->hash); + + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else + break; + } + + if (!node) + return NULL; + + if (tree_node->count == 1) { + stable_node = rb_entry(tree_node->sub_root.rb_node, + struct stable_node, node); + BUG_ON(!stable_node); + + goto get_page_out; + } + + /* + * ok, we have to search the second + * level subtree, hash the page to a + * full strength. + */ + node = tree_node->sub_root.rb_node; + BUG_ON(!node); + hash_max = rmap_item_hash_max(item, hash); + + while (node) { + int cmp; + + stable_node = rb_entry(node, struct stable_node, node); + + cmp = hash_cmp(hash_max, stable_node->hash_max); + + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else + goto get_page_out; + } + + return NULL; + +get_page_out: + page = get_uksm_page(stable_node, 1, 1); + return page; +} + +static int try_merge_rmap_item(struct rmap_item *item, + struct page *kpage, + struct page *tree_page) +{ + spinlock_t *ptl; + pte_t *ptep; + unsigned long addr; + struct vm_area_struct *vma = item->slot->vma; + + addr = get_rmap_addr(item); + ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0); + if (!ptep) + return 0; + + if (pte_write(*ptep)) { + /* has changed, abort! */ + pte_unmap_unlock(ptep, ptl); + return 0; + } + + get_page(tree_page); + page_add_anon_rmap(tree_page, vma, addr); + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(vma->vm_mm, addr, ptep, + mk_pte(tree_page, vma->vm_page_prot)); + + page_remove_rmap(kpage); + put_page(kpage); + + pte_unmap_unlock(ptep, ptl); + + return 1; +} + +/** + * try_to_merge_with_stable_page() - when two rmap_items need to be inserted + * into stable tree, the page was found to be identical to a stable ksm page, + * this is the last chance we can merge them into one. + * + * @item1: the rmap_item holding the page which we wanted to insert + * into stable tree. 
+ * @item2: the other rmap_item we found when unstable tree search + * @oldpage: the page currently mapped by the two rmap_items + * @tree_page: the page we found identical in stable tree node + * @success1: return if item1 is successfully merged + * @success2: return if item2 is successfully merged + */ +static void try_merge_with_stable(struct rmap_item *item1, + struct rmap_item *item2, + struct page **kpage, + struct page *tree_page, + int *success1, int *success2) +{ + struct vm_area_struct *vma1 = item1->slot->vma; + struct vm_area_struct *vma2 = item2->slot->vma; + *success1 = 0; + *success2 = 0; + + if (unlikely(*kpage == tree_page)) { + /* I don't think this can really happen */ + printk(KERN_WARNING "UKSM: unexpected condition detected in " + "try_merge_with_stable() -- *kpage == tree_page !\n"); + *success1 = 1; + *success2 = 1; + return; + } + + if (!PageAnon(*kpage) || !PageKsm(*kpage)) + goto failed; + + if (!trylock_page(tree_page)) + goto failed; + + /* If the oldpage is still ksm and still pointed + * to in the right place, and still write protected, + * we are confident it's not changed, no need to + * memcmp anymore. + * be ware, we cannot take nested pte locks, + * deadlock risk. + */ + if (!try_merge_rmap_item(item1, *kpage, tree_page)) + goto unlock_failed; + + /* ok, then vma2, remind that pte1 already set */ + if (!try_merge_rmap_item(item2, *kpage, tree_page)) + goto success_1; + + *success2 = 1; +success_1: + *success1 = 1; + + + if ((*success1 && vma1->vm_flags & VM_LOCKED) || + (*success2 && vma2->vm_flags & VM_LOCKED)) { + munlock_vma_page(*kpage); + if (!PageMlocked(tree_page)) + mlock_vma_page(tree_page); + } + + /* + * We do not need oldpage any more in the caller, so can break the lock + * now. + */ + unlock_page(*kpage); + *kpage = tree_page; /* Get unlocked outside. */ + return; + +unlock_failed: + unlock_page(tree_page); +failed: + return; +} + +static inline void stable_node_hash_max(struct stable_node *node, + struct page *page, u32 hash) +{ + u32 hash_max = node->hash_max; + + if (!hash_max) { + hash_max = page_hash_max(page, hash); + node->hash_max = hash_max; + } +} + +static inline +struct stable_node *new_stable_node(struct tree_node *tree_node, + struct page *kpage, u32 hash_max) +{ + struct stable_node *new_stable_node; + + new_stable_node = alloc_stable_node(); + if (!new_stable_node) + return NULL; + + new_stable_node->kpfn = page_to_pfn(kpage); + new_stable_node->hash_max = hash_max; + new_stable_node->tree_node = tree_node; + set_page_stable_node(kpage, new_stable_node); + + return new_stable_node; +} + +static inline +struct stable_node *first_level_insert(struct tree_node *tree_node, + struct rmap_item *rmap_item, + struct rmap_item *tree_rmap_item, + struct page **kpage, u32 hash, + int *success1, int *success2) +{ + int cmp; + struct page *tree_page; + u32 hash_max = 0; + struct stable_node *stable_node, *new_snode; + struct rb_node *parent = NULL, **new; + + /* this tree node contains no sub-tree yet */ + stable_node = rb_entry(tree_node->sub_root.rb_node, + struct stable_node, node); + + tree_page = get_uksm_page(stable_node, 1, 0); + if (tree_page) { + cmp = memcmp_pages(*kpage, tree_page, 1); + if (!cmp) { + try_merge_with_stable(rmap_item, tree_rmap_item, kpage, + tree_page, success1, success2); + put_page(tree_page); + if (!*success1 && !*success2) + goto failed; + + return stable_node; + + } else { + /* + * collision in first level try to create a subtree. + * A new node need to be created. 
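+ * The two colliding stable nodes are then told apart by
+ * their HASH_STRENGTH_MAX hashes (hash_max), which become
+ * the keys of the second-level rbtree.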
+ */ + put_page(tree_page); + + stable_node_hash_max(stable_node, tree_page, + tree_node->hash); + hash_max = rmap_item_hash_max(rmap_item, hash); + cmp = hash_cmp(hash_max, stable_node->hash_max); + + parent = &stable_node->node; + if (cmp < 0) { + new = &parent->rb_left; + } else if (cmp > 0) { + new = &parent->rb_right; + } else { + goto failed; + } + } + + } else { + /* the only stable_node deleted, we reuse its tree_node. + */ + parent = NULL; + new = &tree_node->sub_root.rb_node; + } + + new_snode = new_stable_node(tree_node, *kpage, hash_max); + if (!new_snode) + goto failed; + + rb_link_node(&new_snode->node, parent, new); + rb_insert_color(&new_snode->node, &tree_node->sub_root); + tree_node->count++; + *success1 = *success2 = 1; + + return new_snode; + +failed: + return NULL; +} + +static inline +struct stable_node *stable_subtree_insert(struct tree_node *tree_node, + struct rmap_item *rmap_item, + struct rmap_item *tree_rmap_item, + struct page **kpage, u32 hash, + int *success1, int *success2) +{ + struct page *tree_page; + u32 hash_max; + struct stable_node *stable_node, *new_snode; + struct rb_node *parent, **new; + +research: + parent = NULL; + new = &tree_node->sub_root.rb_node; + BUG_ON(!*new); + hash_max = rmap_item_hash_max(rmap_item, hash); + while (*new) { + int cmp; + + stable_node = rb_entry(*new, struct stable_node, node); + + cmp = hash_cmp(hash_max, stable_node->hash_max); + + if (cmp < 0) { + parent = *new; + new = &parent->rb_left; + } else if (cmp > 0) { + parent = *new; + new = &parent->rb_right; + } else { + tree_page = get_uksm_page(stable_node, 1, 0); + if (tree_page) { + cmp = memcmp_pages(*kpage, tree_page, 1); + if (!cmp) { + try_merge_with_stable(rmap_item, + tree_rmap_item, kpage, + tree_page, success1, success2); + + put_page(tree_page); + if (!*success1 && !*success2) + goto failed; + /* + * successfully merged with a stable + * node + */ + return stable_node; + } else { + put_page(tree_page); + goto failed; + } + } else { + /* + * stable node may be deleted, + * and subtree maybe + * restructed, cannot + * continue, research it. + */ + if (tree_node->count) { + goto research; + } else { + /* reuse the tree node*/ + parent = NULL; + new = &tree_node->sub_root.rb_node; + } + } + } + } + + new_snode = new_stable_node(tree_node, *kpage, hash_max); + if (!new_snode) + goto failed; + + rb_link_node(&new_snode->node, parent, new); + rb_insert_color(&new_snode->node, &tree_node->sub_root); + tree_node->count++; + *success1 = *success2 = 1; + + return new_snode; + +failed: + return NULL; +} + + +/** + * stable_tree_insert() - try to insert a merged page in unstable tree to + * the stable tree + * + * @kpage: the page need to be inserted + * @hash: the current hash of this page + * @rmap_item: the rmap_item being scanned + * @tree_rmap_item: the rmap_item found on unstable tree + * @success1: return if rmap_item is merged + * @success2: return if tree_rmap_item is merged + * + * @return the stable_node on stable tree if at least one + * rmap_item is inserted into stable tree, NULL + * otherwise. 
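+ *
+ * Note that *kpage may be replaced on return: when the pages are merged
+ * into an already existing stable node, try_merge_with_stable() points
+ * *kpage at that node's page instead.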
+ */ +static struct stable_node * +stable_tree_insert(struct page **kpage, u32 hash, + struct rmap_item *rmap_item, + struct rmap_item *tree_rmap_item, + int *success1, int *success2) +{ + struct rb_node **new = &root_stable_treep->rb_node; + struct rb_node *parent = NULL; + struct stable_node *stable_node; + struct tree_node *tree_node; + u32 hash_max = 0; + + *success1 = *success2 = 0; + + while (*new) { + int cmp; + + tree_node = rb_entry(*new, struct tree_node, node); + + cmp = hash_cmp(hash, tree_node->hash); + + if (cmp < 0) { + parent = *new; + new = &parent->rb_left; + } else if (cmp > 0) { + parent = *new; + new = &parent->rb_right; + } else + break; + } + + if (*new) { + if (tree_node->count == 1) { + stable_node = first_level_insert(tree_node, rmap_item, + tree_rmap_item, kpage, + hash, success1, success2); + } else { + stable_node = stable_subtree_insert(tree_node, + rmap_item, tree_rmap_item, kpage, + hash, success1, success2); + } + } else { + + /* no tree node found */ + tree_node = alloc_tree_node(stable_tree_node_listp); + if (!tree_node) { + stable_node = NULL; + goto out; + } + + stable_node = new_stable_node(tree_node, *kpage, hash_max); + if (!stable_node) { + free_tree_node(tree_node); + goto out; + } + + tree_node->hash = hash; + rb_link_node(&tree_node->node, parent, new); + rb_insert_color(&tree_node->node, root_stable_treep); + parent = NULL; + new = &tree_node->sub_root.rb_node; + + rb_link_node(&stable_node->node, parent, new); + rb_insert_color(&stable_node->node, &tree_node->sub_root); + tree_node->count++; + *success1 = *success2 = 1; + } + +out: + return stable_node; +} + + +/** + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem + * + * @return 0 on success, -EBUSY if unable to lock the mmap_sem, + * -EINVAL if the page mapping has been changed. + */ +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) +{ + int err; + + err = get_mergeable_page_lock_mmap(tree_rmap_item); + + if (err == -EINVAL) { + /* its page map has been changed, remove it */ + remove_rmap_item_from_tree(tree_rmap_item); + } + + /* The page is gotten and mmap_sem is locked now. */ + return err; +} + + +/** + * unstable_tree_search_insert() - search an unstable tree rmap_item with the + * same hash value. 
Get its page and trylock the mmap_sem + */ +static inline +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, + u32 hash) + +{ + struct rb_node **new = &root_unstable_tree.rb_node; + struct rb_node *parent = NULL; + struct tree_node *tree_node; + u32 hash_max; + struct rmap_item *tree_rmap_item; + + while (*new) { + int cmp; + + tree_node = rb_entry(*new, struct tree_node, node); + + cmp = hash_cmp(hash, tree_node->hash); + + if (cmp < 0) { + parent = *new; + new = &parent->rb_left; + } else if (cmp > 0) { + parent = *new; + new = &parent->rb_right; + } else + break; + } + + if (*new) { + /* got the tree_node */ + if (tree_node->count == 1) { + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, + struct rmap_item, node); + BUG_ON(!tree_rmap_item); + + goto get_page_out; + } + + /* well, search the collision subtree */ + new = &tree_node->sub_root.rb_node; + BUG_ON(!*new); + hash_max = rmap_item_hash_max(rmap_item, hash); + + while (*new) { + int cmp; + + tree_rmap_item = rb_entry(*new, struct rmap_item, + node); + + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); + parent = *new; + if (cmp < 0) + new = &parent->rb_left; + else if (cmp > 0) + new = &parent->rb_right; + else + goto get_page_out; + } + } else { + /* alloc a new tree_node */ + tree_node = alloc_tree_node(&unstable_tree_node_list); + if (!tree_node) + return NULL; + + tree_node->hash = hash; + rb_link_node(&tree_node->node, parent, new); + rb_insert_color(&tree_node->node, &root_unstable_tree); + parent = NULL; + new = &tree_node->sub_root.rb_node; + } + + /* did not found even in sub-tree */ + rmap_item->tree_node = tree_node; + rmap_item->address |= UNSTABLE_FLAG; + rmap_item->hash_round = uksm_hash_round; + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, &tree_node->sub_root); + + uksm_pages_unshared++; + return NULL; + +get_page_out: + if (tree_rmap_item->page == rmap_item->page) + return NULL; + + if (get_tree_rmap_item_page(tree_rmap_item)) + return NULL; + + return tree_rmap_item; +} + +static void hold_anon_vma(struct rmap_item *rmap_item, + struct anon_vma *anon_vma) +{ + rmap_item->anon_vma = anon_vma; + get_anon_vma(anon_vma); +} + + +/** + * stable_tree_append() - append a rmap_item to a stable node. Deduplication + * ratio statistics is done in this function. 
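+ * Rmap_items of the same vma_slot are grouped under one node_vma, and the
+ * node_vma hlist of a stable node is kept sorted by the slot key.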
+ * + */ +static void stable_tree_append(struct rmap_item *rmap_item, + struct stable_node *stable_node, int logdedup) +{ + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; + unsigned long key = (unsigned long)rmap_item->slot; + unsigned long factor = rmap_item->slot->rung->step; + + BUG_ON(!stable_node); + rmap_item->address |= STABLE_FLAG; + + if (hlist_empty(&stable_node->hlist)) { + uksm_pages_shared++; + goto node_vma_new; + } else { + uksm_pages_sharing++; + } + + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { + if (node_vma->key >= key) + break; + + if (logdedup) { + node_vma->slot->pages_bemerged += factor; + if (list_empty(&node_vma->slot->dedup_list)) + list_add(&node_vma->slot->dedup_list, + &vma_slot_dedup); + } + } + + if (node_vma) { + if (node_vma->key == key) { + node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); + goto node_vma_ok; + } else if (node_vma->key > key) { + node_vma_cont = node_vma; + } + } + +node_vma_new: + /* no same vma already in node, alloc a new node_vma */ + new_node_vma = alloc_node_vma(); + BUG_ON(!new_node_vma); + new_node_vma->head = stable_node; + new_node_vma->slot = rmap_item->slot; + + if (!node_vma) { + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); + } else if (node_vma->key != key) { + if (node_vma->key < key) + hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); + else { + hlist_add_before(&new_node_vma->hlist, + &node_vma->hlist); + } + + } + node_vma = new_node_vma; + +node_vma_ok: /* ok, ready to add to the list */ + rmap_item->head = node_vma; + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); + if (logdedup) { + rmap_item->slot->pages_merged++; + if (node_vma_cont) { + node_vma = node_vma_cont; + hlist_for_each_entry_continue(node_vma, hlist) { + node_vma->slot->pages_bemerged += factor; + if (list_empty(&node_vma->slot->dedup_list)) + list_add(&node_vma->slot->dedup_list, + &vma_slot_dedup); + } + } + } +} + +/* + * We use break_ksm to break COW on a ksm page: it's a stripped down + * + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) + * put_page(page); + * + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * in case the application has unmapped and remapped mm,addr meanwhile. + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + */ +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +{ + struct page *page; + int ret = 0; + + do { + cond_resched(); + page = follow_page(vma, addr, FOLL_GET); + if (IS_ERR_OR_NULL(page)) + break; + if (PageKsm(page)) { + ret = handle_mm_fault(vma->vm_mm, vma, addr, + FAULT_FLAG_WRITE); + } else + ret = VM_FAULT_WRITE; + put_page(page); + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); + /* + * We must loop because handle_mm_fault() may back out if there's + * any difficulty e.g. if pte accessed bit gets updated concurrently. + * + * VM_FAULT_WRITE is what we have been hoping for: it indicates that + * COW has been broken, even if the vma does not permit VM_WRITE; + * but note that a concurrent fault might break PageKsm for us. + * + * VM_FAULT_SIGBUS could occur if we race with truncation of the + * backing file, which also invalidates anonymous pages: that's + * okay, that truncation will have unmapped the PageKsm for us. 
+ * + * VM_FAULT_OOM: at the time of writing (late July 2009), setting + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the + * current task has TIF_MEMDIE set, and will be OOM killed on return + * to user; and ksmd, having no mm, would never be chosen for that. + * + * But if the mm is in a limited mem_cgroup, then the fault may fail + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and + * even ksmd can fail in this way - though it's usually breaking ksm + * just to undo a merge it made a moment before, so unlikely to oom. + * + * That's a pity: we might therefore have more kernel pages allocated + * than we're counting as nodes in the stable tree; but uksm_do_scan + * will retry to break_cow on each pass, so should recover the page + * in due course. The important thing is to not let VM_MERGEABLE + * be cleared while any such pages might remain in the area. + */ + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; +} + +static void break_cow(struct rmap_item *rmap_item) +{ + struct vm_area_struct *vma = rmap_item->slot->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = get_rmap_addr(rmap_item); + + if (uksm_test_exit(mm)) + goto out; + + break_ksm(vma, addr); +out: + return; +} + +/* + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather + * than check every pte of a given vma, the locking doesn't quite work for + * that - an rmap_item is assigned to the stable tree after inserting ksm + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing + * rmap_items from parent to child at fork time (so as not to waste time + * if exit comes before the next scan reaches it). + * + * Similarly, although we'd like to remove rmap_items (so updating counts + * and freeing memory) when unmerging an area, it's easier to leave that + * to the next pass of ksmd - consider, for example, how ksmd might be + * in cmp_and_merge_page on one of the rmap_items we would be removing. 
+ */ +inline int unmerge_uksm_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + unsigned long addr; + int err = 0; + + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { + if (uksm_test_exit(vma->vm_mm)) + break; + if (signal_pending(current)) + err = -ERESTARTSYS; + else + err = break_ksm(vma, addr); + } + return err; +} + +static inline void inc_uksm_pages_scanned(void) +{ + u64 delta; + + + if (uksm_pages_scanned == U64_MAX) { + encode_benefit(); + + delta = uksm_pages_scanned >> pages_scanned_base; + + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { + pages_scanned_stored >>= 1; + delta >>= 1; + pages_scanned_base++; + } + + pages_scanned_stored += delta; + + uksm_pages_scanned = uksm_pages_scanned_last = 0; + } + + uksm_pages_scanned++; +} + +static inline int find_zero_page_hash(int strength, u32 hash) +{ + return (zero_hash_table[strength] == hash); +} + +static +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) +{ + struct page *zero_page = empty_uksm_zero_page; + struct mm_struct *mm = vma->vm_mm; + pte_t orig_pte = __pte(0); + int err = -EFAULT; + + if (uksm_test_exit(mm)) + goto out; + + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); + + if (!PageAnon(page)) + goto out; + + if (!trylock_page(page)) + goto out; + + if (write_protect_page(vma, page, &orig_pte, 0) == 0) { + if (is_page_full_zero(page)) + err = replace_page(vma, page, zero_page, orig_pte); + } + + unlock_page(page); +out: + return err; +} + +/* + * cmp_and_merge_page() - first see if page can be merged into the stable + * tree; if not, compare hash to previous and if it's the same, see if page + * can be inserted into the unstable tree, or merged with a page already there + * and both transferred to the stable tree. + * + * @page: the page that we are searching identical page to. + * @rmap_item: the reverse mapping into the virtual address of this page + */ +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) +{ + struct rmap_item *tree_rmap_item; + struct page *page; + struct page *kpage = NULL; + u32 hash_max; + int err; + unsigned int success1, success2; + struct stable_node *snode; + int cmp; + struct rb_node *parent = NULL, **new; + + remove_rmap_item_from_tree(rmap_item); + page = rmap_item->page; + + /* We first start with searching the page inside the stable tree */ + kpage = stable_tree_search(rmap_item, hash); + if (kpage) { + err = try_to_merge_with_uksm_page(rmap_item, kpage, + hash); + if (!err) { + /* + * The page was successfully merged, add + * its rmap_item to the stable tree. + * page lock is needed because it's + * racing with try_to_unmap_ksm(), etc. + */ + lock_page(kpage); + snode = page_stable_node(kpage); + stable_tree_append(rmap_item, snode, 1); + unlock_page(kpage); + put_page(kpage); + return; /* success */ + } + put_page(kpage); + + /* + * if it's a collision and it has been search in sub-rbtree + * (hash_max != 0), we want to abort, because if it is + * successfully merged in unstable tree, the collision trends to + * happen again. 
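+ * A non-zero hash_max means the second-level search already used
+ * the full-strength hash, so retrying cannot resolve the collision.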
+ */ + if (err == MERGE_ERR_COLLI && rmap_item->hash_max) + return; + } + + tree_rmap_item = + unstable_tree_search_insert(rmap_item, hash); + if (tree_rmap_item) { + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); + /* + * As soon as we merge this page, we want to remove the + * rmap_item of the page we have merged with from the unstable + * tree, and insert it instead as new node in the stable tree. + */ + if (!err) { + kpage = page; + remove_rmap_item_from_tree(tree_rmap_item); + lock_page(kpage); + snode = stable_tree_insert(&kpage, hash, + rmap_item, tree_rmap_item, + &success1, &success2); + + /* + * Do not log dedup for tree item, it's not counted as + * scanned in this round. + */ + if (success2) + stable_tree_append(tree_rmap_item, snode, 0); + + /* + * The order of these two stable append is important: + * we are scanning rmap_item. + */ + if (success1) + stable_tree_append(rmap_item, snode, 1); + + /* + * The original kpage may be unlocked inside + * stable_tree_insert() already. This page + * should be unlocked before doing + * break_cow(). + */ + unlock_page(kpage); + + if (!success1) + break_cow(rmap_item); + + if (!success2) + break_cow(tree_rmap_item); + + } else if (err == MERGE_ERR_COLLI) { + BUG_ON(tree_rmap_item->tree_node->count > 1); + + rmap_item_hash_max(tree_rmap_item, + tree_rmap_item->tree_node->hash); + + hash_max = rmap_item_hash_max(rmap_item, hash); + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); + parent = &tree_rmap_item->node; + if (cmp < 0) + new = &parent->rb_left; + else if (cmp > 0) + new = &parent->rb_right; + else + goto put_up_out; + + rmap_item->tree_node = tree_rmap_item->tree_node; + rmap_item->address |= UNSTABLE_FLAG; + rmap_item->hash_round = uksm_hash_round; + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, + &tree_rmap_item->tree_node->sub_root); + rmap_item->tree_node->count++; + } else { + /* + * either one of the page has changed or they collide + * at the max hash, we consider them as ill items. 
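+ * Dropping tree_rmap_item from the unstable tree lets it be
+ * re-hashed and re-inserted on a later scan pass.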
+ */ + remove_rmap_item_from_tree(tree_rmap_item); + } +put_up_out: + put_page(tree_rmap_item->page); + up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); + } +} + + + + +static inline unsigned long get_pool_index(struct vma_slot *slot, + unsigned long index) +{ + unsigned long pool_index; + + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; + if (pool_index >= slot->pool_size) + BUG(); + return pool_index; +} + +static inline unsigned long index_page_offset(unsigned long index) +{ + return offset_in_page(sizeof(struct rmap_list_entry *) * index); +} + +static inline +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, + unsigned long index, int need_alloc) +{ + unsigned long pool_index; + struct page *page; + void *addr; + + + pool_index = get_pool_index(slot, index); + if (!slot->rmap_list_pool[pool_index]) { + if (!need_alloc) + return NULL; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); + if (!page) + return NULL; + + slot->rmap_list_pool[pool_index] = page; + } + + addr = kmap(slot->rmap_list_pool[pool_index]); + addr += index_page_offset(index); + + return addr; +} + +static inline void put_rmap_list_entry(struct vma_slot *slot, + unsigned long index) +{ + unsigned long pool_index; + + pool_index = get_pool_index(slot, index); + BUG_ON(!slot->rmap_list_pool[pool_index]); + kunmap(slot->rmap_list_pool[pool_index]); +} + +static inline int entry_is_new(struct rmap_list_entry *entry) +{ + return !entry->item; +} + +static inline unsigned long get_index_orig_addr(struct vma_slot *slot, + unsigned long index) +{ + return slot->vma->vm_start + (index << PAGE_SHIFT); +} + +static inline unsigned long get_entry_address(struct rmap_list_entry *entry) +{ + unsigned long addr; + + if (is_addr(entry->addr)) + addr = get_clean_addr(entry->addr); + else if (entry->item) + addr = get_rmap_addr(entry->item); + else + BUG(); + + return addr; +} + +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) +{ + if (is_addr(entry->addr)) + return NULL; + + return entry->item; +} + +static inline void inc_rmap_list_pool_count(struct vma_slot *slot, + unsigned long index) +{ + unsigned long pool_index; + + pool_index = get_pool_index(slot, index); + BUG_ON(!slot->rmap_list_pool[pool_index]); + slot->pool_counts[pool_index]++; +} + +static inline void dec_rmap_list_pool_count(struct vma_slot *slot, + unsigned long index) +{ + unsigned long pool_index; + + pool_index = get_pool_index(slot, index); + BUG_ON(!slot->rmap_list_pool[pool_index]); + BUG_ON(!slot->pool_counts[pool_index]); + slot->pool_counts[pool_index]--; +} + +static inline int entry_has_rmap(struct rmap_list_entry *entry) +{ + return !is_addr(entry->addr) && entry->item; +} + +static inline void swap_entries(struct rmap_list_entry *entry1, + unsigned long index1, + struct rmap_list_entry *entry2, + unsigned long index2) +{ + struct rmap_list_entry tmp; + + /* swapping two new entries is meaningless */ + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); + + tmp = *entry1; + *entry1 = *entry2; + *entry2 = tmp; + + if (entry_has_rmap(entry1)) + entry1->item->entry_index = index1; + + if (entry_has_rmap(entry2)) + entry2->item->entry_index = index2; + + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { + inc_rmap_list_pool_count(entry1->item->slot, index1); + dec_rmap_list_pool_count(entry1->item->slot, index2); + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { + inc_rmap_list_pool_count(entry2->item->slot, index2); + 
dec_rmap_list_pool_count(entry2->item->slot, index1); + } +} + +static inline void free_entry_item(struct rmap_list_entry *entry) +{ + unsigned long index; + struct rmap_item *item; + + if (!is_addr(entry->addr)) { + BUG_ON(!entry->item); + item = entry->item; + entry->addr = get_rmap_addr(item); + set_is_addr(entry->addr); + index = item->entry_index; + remove_rmap_item_from_tree(item); + dec_rmap_list_pool_count(item->slot, index); + free_rmap_item(item); + } +} + +static inline int pool_entry_boundary(unsigned long index) +{ + unsigned long linear_addr; + + linear_addr = sizeof(struct rmap_list_entry *) * index; + return index && !offset_in_page(linear_addr); +} + +static inline void try_free_last_pool(struct vma_slot *slot, + unsigned long index) +{ + unsigned long pool_index; + + pool_index = get_pool_index(slot, index); + if (slot->rmap_list_pool[pool_index] && + !slot->pool_counts[pool_index]) { + __free_page(slot->rmap_list_pool[pool_index]); + slot->rmap_list_pool[pool_index] = NULL; + slot->flags |= UKSM_SLOT_NEED_SORT; + } + +} + +static inline unsigned long vma_item_index(struct vm_area_struct *vma, + struct rmap_item *item) +{ + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; +} + +static int within_same_pool(struct vma_slot *slot, + unsigned long i, unsigned long j) +{ + unsigned long pool_i, pool_j; + + pool_i = get_pool_index(slot, i); + pool_j = get_pool_index(slot, j); + + return (pool_i == pool_j); +} + +static void sort_rmap_entry_list(struct vma_slot *slot) +{ + unsigned long i, j; + struct rmap_list_entry *entry, *swap_entry; + + entry = get_rmap_list_entry(slot, 0, 0); + for (i = 0; i < slot->pages; ) { + + if (!entry) + goto skip_whole_pool; + + if (entry_is_new(entry)) + goto next_entry; + + if (is_addr(entry->addr)) { + entry->addr = 0; + goto next_entry; + } + + j = vma_item_index(slot->vma, entry->item); + if (j == i) + goto next_entry; + + if (within_same_pool(slot, i, j)) + swap_entry = entry + j - i; + else + swap_entry = get_rmap_list_entry(slot, j, 1); + + swap_entries(entry, i, swap_entry, j); + if (!within_same_pool(slot, i, j)) + put_rmap_list_entry(slot, j); + continue; + +skip_whole_pool: + i += PAGE_SIZE / sizeof(*entry); + if (i < slot->pages) + entry = get_rmap_list_entry(slot, i, 0); + continue; + +next_entry: + if (i >= slot->pages - 1 || + !within_same_pool(slot, i, i + 1)) { + put_rmap_list_entry(slot, i); + if (i + 1 < slot->pages) + entry = get_rmap_list_entry(slot, i + 1, 0); + } else + entry++; + i++; + continue; + } + + /* free empty pool entries which contain no rmap_item */ + /* CAN be simplied to based on only pool_counts when bug freed !!!!! */ + for (i = 0; i < slot->pool_size; i++) { + unsigned char has_rmap; + void *addr; + + if (!slot->rmap_list_pool[i]) + continue; + + has_rmap = 0; + addr = kmap(slot->rmap_list_pool[i]); + BUG_ON(!addr); + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { + entry = (struct rmap_list_entry *)addr + j; + if (is_addr(entry->addr)) + continue; + if (!entry->item) + continue; + has_rmap = 1; + } + kunmap(slot->rmap_list_pool[i]); + if (!has_rmap) { + BUG_ON(slot->pool_counts[i]); + __free_page(slot->rmap_list_pool[i]); + slot->rmap_list_pool[i] = NULL; + } + } + + slot->flags &= ~UKSM_SLOT_NEED_SORT; +} + +/* + * vma_fully_scanned() - if all the pages in this slot have been scanned. 
+ */ +static inline int vma_fully_scanned(struct vma_slot *slot) +{ + return slot->pages_scanned == slot->pages; +} + +/** + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to + * its random permutation. This function is embedded with the random + * permutation index management code. + */ +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) +{ + unsigned long rand_range, addr, swap_index, scan_index; + struct rmap_item *item = NULL; + struct rmap_list_entry *scan_entry, *swap_entry = NULL; + struct page *page; + + scan_index = swap_index = slot->pages_scanned % slot->pages; + + if (pool_entry_boundary(scan_index)) + try_free_last_pool(slot, scan_index - 1); + + if (vma_fully_scanned(slot)) { + if (slot->flags & UKSM_SLOT_NEED_SORT) + slot->flags |= UKSM_SLOT_NEED_RERAND; + else + slot->flags &= ~UKSM_SLOT_NEED_RERAND; + if (slot->flags & UKSM_SLOT_NEED_SORT) + sort_rmap_entry_list(slot); + } + + scan_entry = get_rmap_list_entry(slot, scan_index, 1); + if (!scan_entry) + return NULL; + + if (entry_is_new(scan_entry)) { + scan_entry->addr = get_index_orig_addr(slot, scan_index); + set_is_addr(scan_entry->addr); + } + + if (slot->flags & UKSM_SLOT_NEED_RERAND) { + rand_range = slot->pages - scan_index; + BUG_ON(!rand_range); + swap_index = scan_index + (prandom_u32() % rand_range); + } + + if (swap_index != scan_index) { + swap_entry = get_rmap_list_entry(slot, swap_index, 1); + if (entry_is_new(swap_entry)) { + swap_entry->addr = get_index_orig_addr(slot, + swap_index); + set_is_addr(swap_entry->addr); + } + swap_entries(scan_entry, scan_index, swap_entry, swap_index); + } + + addr = get_entry_address(scan_entry); + item = get_entry_item(scan_entry); + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); + + page = follow_page(slot->vma, addr, FOLL_GET); + if (IS_ERR_OR_NULL(page)) + goto nopage; + + if (!PageAnon(page) && !page_trans_compound_anon(page)) + goto putpage; + + /*check is zero_page pfn or uksm_zero_page*/ + if ((page_to_pfn(page) == zero_pfn) + || (page_to_pfn(page) == uksm_zero_pfn)) + goto putpage; + + flush_anon_page(slot->vma, page, addr); + flush_dcache_page(page); + + + *hash = page_hash(page, hash_strength, 1); + inc_uksm_pages_scanned(); + /*if the page content all zero, re-map to zero-page*/ + if (find_zero_page_hash(hash_strength, *hash)) { + if (!cmp_and_merge_zero_page(slot->vma, page)) { + slot->pages_merged++; + inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); + dec_mm_counter(slot->mm, MM_ANONPAGES); + + /* For full-zero pages, no need to create rmap item */ + goto putpage; + } else { + inc_rshash_neg(memcmp_cost / 2); + } + } + + if (!item) { + item = alloc_rmap_item(); + if (item) { + /* It has already been zeroed */ + item->slot = slot; + item->address = addr; + item->entry_index = scan_index; + scan_entry->item = item; + inc_rmap_list_pool_count(slot, scan_index); + } else + goto putpage; + } + + BUG_ON(item->slot != slot); + /* the page may have changed */ + item->page = page; + put_rmap_list_entry(slot, scan_index); + if (swap_entry) + put_rmap_list_entry(slot, swap_index); + return item; + +putpage: + put_page(page); + page = NULL; +nopage: + /* no page, store addr back and free rmap_item if possible */ + free_entry_item(scan_entry); + put_rmap_list_entry(slot, scan_index); + if (swap_entry) + put_rmap_list_entry(slot, swap_index); + return NULL; +} + +static inline int in_stable_tree(struct rmap_item *rmap_item) +{ + return rmap_item->address & STABLE_FLAG; +} + +/** + * scan_vma_one_page() - scan 
the next page in a vma_slot. Called with + * mmap_sem locked. + */ +static noinline void scan_vma_one_page(struct vma_slot *slot) +{ + u32 hash; + struct mm_struct *mm; + struct rmap_item *rmap_item = NULL; + struct vm_area_struct *vma = slot->vma; + + mm = vma->vm_mm; + BUG_ON(!mm); + BUG_ON(!slot); + + rmap_item = get_next_rmap_item(slot, &hash); + if (!rmap_item) + goto out1; + + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) + goto out2; + + cmp_and_merge_page(rmap_item, hash); +out2: + put_page(rmap_item->page); +out1: + slot->pages_scanned++; + if (slot->fully_scanned_round != fully_scanned_round) + scanned_virtual_pages++; + + if (vma_fully_scanned(slot)) + slot->fully_scanned_round = fully_scanned_round; +} + +static inline unsigned long rung_get_pages(struct scan_rung *rung) +{ + struct slot_tree_node *node; + + if (!rung->vma_root.rnode) + return 0; + + node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); + + return node->size; +} + +#define RUNG_SAMPLED_MIN 3 + +static inline +void uksm_calc_rung_step(struct scan_rung *rung, + unsigned long page_time, unsigned long ratio) +{ + unsigned long sampled, pages; + + /* will be fully scanned ? */ + if (!rung->cover_msecs) { + rung->step = 1; + return; + } + + sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) + * ratio / page_time; + + /* + * Before we finsish a scan round and expensive per-round jobs, + * we need to have a chance to estimate the per page time. So + * the sampled number can not be too small. + */ + if (sampled < RUNG_SAMPLED_MIN) + sampled = RUNG_SAMPLED_MIN; + + pages = rung_get_pages(rung); + if (likely(pages > sampled)) + rung->step = pages / sampled; + else + rung->step = 1; +} + +static inline int step_need_recalc(struct scan_rung *rung) +{ + unsigned long pages, stepmax; + + pages = rung_get_pages(rung); + stepmax = pages / RUNG_SAMPLED_MIN; + + return pages && (rung->step > pages || + (stepmax && rung->step > stepmax)); +} + +static inline +void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) +{ + struct vma_slot *slot; + + if (finished) + rung->flags |= UKSM_RUNG_ROUND_FINISHED; + + if (step_recalc || step_need_recalc(rung)) { + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); + BUG_ON(step_need_recalc(rung)); + } + + slot_iter_index = prandom_u32() % rung->step; + BUG_ON(!rung->vma_root.rnode); + slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); + BUG_ON(!slot); + + rung->current_scan = slot; + rung->current_offset = slot_iter_index; +} + +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) +{ + return &slot->rung->vma_root; +} + +/* + * return if resetted. 
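+ * (returns 1 when the rung wrapped around and reset_current_scan() was
+ * called, 0 when the scan merely advanced to the next slot)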
+ */ +static int advance_current_scan(struct scan_rung *rung) +{ + unsigned short n; + struct vma_slot *slot, *next = NULL; + + BUG_ON(!rung->vma_root.num); + + slot = rung->current_scan; + n = (slot->pages - rung->current_offset) % rung->step; + slot_iter_index = rung->step - n; + next = sradix_tree_next(&rung->vma_root, slot->snode, + slot->sindex, slot_iter); + + if (next) { + rung->current_offset = slot_iter_index; + rung->current_scan = next; + return 0; + } else { + reset_current_scan(rung, 1, 0); + return 1; + } +} + +static inline void rung_rm_slot(struct vma_slot *slot) +{ + struct scan_rung *rung = slot->rung; + struct sradix_tree_root *root; + + if (rung->current_scan == slot) + advance_current_scan(rung); + + root = slot_get_root(slot); + sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); + slot->snode = NULL; + if (step_need_recalc(rung)) { + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); + BUG_ON(step_need_recalc(rung)); + } + + /* In case advance_current_scan loop back to this slot again */ + if (rung->vma_root.num && rung->current_scan == slot) + reset_current_scan(slot->rung, 1, 0); +} + +static inline void rung_add_new_slots(struct scan_rung *rung, + struct vma_slot **slots, unsigned long num) +{ + int err; + struct vma_slot *slot; + unsigned long i; + struct sradix_tree_root *root = &rung->vma_root; + + err = sradix_tree_enter(root, (void **)slots, num); + BUG_ON(err); + + for (i = 0; i < num; i++) { + slot = slots[i]; + slot->rung = rung; + BUG_ON(vma_fully_scanned(slot)); + } + + if (rung->vma_root.num == num) + reset_current_scan(rung, 0, 1); +} + +static inline int rung_add_one_slot(struct scan_rung *rung, + struct vma_slot *slot) +{ + int err; + + err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); + if (err) + return err; + + slot->rung = rung; + if (rung->vma_root.num == 1) + reset_current_scan(rung, 0, 1); + + return 0; +} + +/* + * Return true if the slot is deleted from its rung. + */ +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) +{ + struct scan_rung *old_rung = slot->rung; + int err; + + if (old_rung == rung) + return 0; + + rung_rm_slot(slot); + err = rung_add_one_slot(rung, slot); + if (err) { + err = rung_add_one_slot(old_rung, slot); + WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ + } + + return 1; +} + +static inline int vma_rung_up(struct vma_slot *slot) +{ + struct scan_rung *rung; + + rung = slot->rung; + if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) + rung++; + + return vma_rung_enter(slot, rung); +} + +static inline int vma_rung_down(struct vma_slot *slot) +{ + struct scan_rung *rung; + + rung = slot->rung; + if (slot->rung != &uksm_scan_ladder[0]) + rung--; + + return vma_rung_enter(slot, rung); +} + +/** + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. + */ +static unsigned long cal_dedup_ratio(struct vma_slot *slot) +{ + unsigned long ret; + + BUG_ON(slot->pages_scanned == slot->last_scanned); + + ret = slot->pages_merged; + + /* Thrashing area filtering */ + if (ret && uksm_thrash_threshold) { + if (slot->pages_cowed * 100 / slot->pages_merged + > uksm_thrash_threshold) { + ret = 0; + } else { + ret = slot->pages_merged - slot->pages_cowed; + } + } + + return ret; +} + +/** + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. 
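+ * This _old variant works on pages_bemerged relative to pages_scanned and
+ * is what round_update_ladder() uses when deciding whether to promote a slot.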
+ */ +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) +{ + unsigned long ret; + unsigned long pages_scanned; + + pages_scanned = slot->pages_scanned; + if (!pages_scanned) { + if (uksm_thrash_threshold) + return 0; + else + pages_scanned = slot->pages_scanned; + } + + ret = slot->pages_bemerged * 100 / pages_scanned; + + /* Thrashing area filtering */ + if (ret && uksm_thrash_threshold) { + if (slot->pages_cowed * 100 / slot->pages_bemerged + > uksm_thrash_threshold) { + ret = 0; + } else { + ret = slot->pages_bemerged - slot->pages_cowed; + } + } + + return ret; +} + +/** + * stable_node_reinsert() - When the hash_strength has been adjusted, the + * stable tree need to be restructured, this is the function re-inserting the + * stable node. + */ +static inline void stable_node_reinsert(struct stable_node *new_node, + struct page *page, + struct rb_root *root_treep, + struct list_head *tree_node_listp, + u32 hash) +{ + struct rb_node **new = &root_treep->rb_node; + struct rb_node *parent = NULL; + struct stable_node *stable_node; + struct tree_node *tree_node; + struct page *tree_page; + int cmp; + + while (*new) { + int cmp; + + tree_node = rb_entry(*new, struct tree_node, node); + + cmp = hash_cmp(hash, tree_node->hash); + + if (cmp < 0) { + parent = *new; + new = &parent->rb_left; + } else if (cmp > 0) { + parent = *new; + new = &parent->rb_right; + } else + break; + } + + if (*new) { + /* find a stable tree node with same first level hash value */ + stable_node_hash_max(new_node, page, hash); + if (tree_node->count == 1) { + stable_node = rb_entry(tree_node->sub_root.rb_node, + struct stable_node, node); + tree_page = get_uksm_page(stable_node, 1, 0); + if (tree_page) { + stable_node_hash_max(stable_node, + tree_page, hash); + put_page(tree_page); + + /* prepare for stable node insertion */ + + cmp = hash_cmp(new_node->hash_max, + stable_node->hash_max); + parent = &stable_node->node; + if (cmp < 0) + new = &parent->rb_left; + else if (cmp > 0) + new = &parent->rb_right; + else + goto failed; + + goto add_node; + } else { + /* the only stable_node deleted, the tree node + * was not deleted. + */ + goto tree_node_reuse; + } + } + + /* well, search the collision subtree */ + new = &tree_node->sub_root.rb_node; + parent = NULL; + BUG_ON(!*new); + while (*new) { + int cmp; + + stable_node = rb_entry(*new, struct stable_node, node); + + cmp = hash_cmp(new_node->hash_max, + stable_node->hash_max); + + if (cmp < 0) { + parent = *new; + new = &parent->rb_left; + } else if (cmp > 0) { + parent = *new; + new = &parent->rb_right; + } else { + /* oh, no, still a collision */ + goto failed; + } + } + + goto add_node; + } + + /* no tree node found */ + tree_node = alloc_tree_node(tree_node_listp); + if (!tree_node) { + printk(KERN_ERR "UKSM: memory allocation error!\n"); + goto failed; + } else { + tree_node->hash = hash; + rb_link_node(&tree_node->node, parent, new); + rb_insert_color(&tree_node->node, root_treep); + +tree_node_reuse: + /* prepare for stable node insertion */ + parent = NULL; + new = &tree_node->sub_root.rb_node; + } + +add_node: + rb_link_node(&new_node->node, parent, new); + rb_insert_color(&new_node->node, &tree_node->sub_root); + new_node->tree_node = tree_node; + tree_node->count++; + return; + +failed: + /* This can only happen when two nodes have collided + * in two levels. 
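+ * The node is then left with a NULL tree_node and is simply
+ * not re-inserted into the new stable tree.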
+ */ + new_node->tree_node = NULL; + return; +} + +static inline void free_all_tree_nodes(struct list_head *list) +{ + struct tree_node *node, *tmp; + + list_for_each_entry_safe(node, tmp, list, all_list) { + free_tree_node(node); + } +} + +/** + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash + * strength to the current hash_strength. It re-structures the hole tree. + */ +static inline void stable_tree_delta_hash(u32 prev_hash_strength) +{ + struct stable_node *node, *tmp; + struct rb_root *root_new_treep; + struct list_head *new_tree_node_listp; + + stable_tree_index = (stable_tree_index + 1) % 2; + root_new_treep = &root_stable_tree[stable_tree_index]; + new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; + *root_new_treep = RB_ROOT; + BUG_ON(!list_empty(new_tree_node_listp)); + + /* + * we need to be safe, the node could be removed by get_uksm_page() + */ + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { + void *addr; + struct page *node_page; + u32 hash; + + /* + * We are completely re-structuring the stable nodes to a new + * stable tree. We don't want to touch the old tree unlinks and + * old tree_nodes. The old tree_nodes will be freed at once. + */ + node_page = get_uksm_page(node, 0, 0); + if (!node_page) + continue; + + if (node->tree_node) { + hash = node->tree_node->hash; + + addr = kmap_atomic(node_page); + + hash = delta_hash(addr, prev_hash_strength, + hash_strength, hash); + kunmap_atomic(addr); + } else { + /* + *it was not inserted to rbtree due to collision in last + *round scan. + */ + hash = page_hash(node_page, hash_strength, 0); + } + + stable_node_reinsert(node, node_page, root_new_treep, + new_tree_node_listp, hash); + put_page(node_page); + } + + root_stable_treep = root_new_treep; + free_all_tree_nodes(stable_tree_node_listp); + BUG_ON(!list_empty(stable_tree_node_listp)); + stable_tree_node_listp = new_tree_node_listp; +} + +static inline void inc_hash_strength(unsigned long delta) +{ + hash_strength += 1 << delta; + if (hash_strength > HASH_STRENGTH_MAX) + hash_strength = HASH_STRENGTH_MAX; +} + +static inline void dec_hash_strength(unsigned long delta) +{ + unsigned long change = 1 << delta; + + if (hash_strength <= change + 1) + hash_strength = 1; + else + hash_strength -= change; +} + +static inline void inc_hash_strength_delta(void) +{ + hash_strength_delta++; + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) + hash_strength_delta = HASH_STRENGTH_DELTA_MAX; +} + +/* +static inline unsigned long get_current_neg_ratio(void) +{ + if (!rshash_pos || rshash_neg > rshash_pos) + return 100; + + return div64_u64(100 * rshash_neg , rshash_pos); +} +*/ + +static inline unsigned long get_current_neg_ratio(void) +{ + u64 pos = benefit.pos; + u64 neg = benefit.neg; + + if (!neg) + return 0; + + if (!pos || neg > pos) + return 100; + + if (neg > div64_u64(U64_MAX, 100)) + pos = div64_u64(pos, 100); + else + neg *= 100; + + return div64_u64(neg, pos); +} + +static inline unsigned long get_current_benefit(void) +{ + u64 pos = benefit.pos; + u64 neg = benefit.neg; + u64 scanned = benefit.scanned; + + if (neg > pos) + return 0; + + return div64_u64((pos - neg), scanned); +} + +static inline int judge_rshash_direction(void) +{ + u64 current_neg_ratio, stable_benefit; + u64 current_benefit, delta = 0; + int ret = STILL; + + /* Try to probe a value after the boot, and in case the system + are still for a long time. 
*/ + if ((fully_scanned_round & 0xFFULL) == 10) { + ret = OBSCURE; + goto out; + } + + current_neg_ratio = get_current_neg_ratio(); + + if (current_neg_ratio == 0) { + rshash_neg_cont_zero++; + if (rshash_neg_cont_zero > 2) + return GO_DOWN; + else + return STILL; + } + rshash_neg_cont_zero = 0; + + if (current_neg_ratio > 90) { + ret = GO_UP; + goto out; + } + + current_benefit = get_current_benefit(); + stable_benefit = rshash_state.stable_benefit; + + if (!stable_benefit) { + ret = OBSCURE; + goto out; + } + + if (current_benefit > stable_benefit) + delta = current_benefit - stable_benefit; + else if (current_benefit < stable_benefit) + delta = stable_benefit - current_benefit; + + delta = div64_u64(100 * delta , stable_benefit); + + if (delta > 50) { + rshash_cont_obscure++; + if (rshash_cont_obscure > 2) + return OBSCURE; + else + return STILL; + } + +out: + rshash_cont_obscure = 0; + return ret; +} + +/** + * rshash_adjust() - The main function to control the random sampling state + * machine for hash strength adapting. + * + * return true if hash_strength has changed. + */ +static inline int rshash_adjust(void) +{ + unsigned long prev_hash_strength = hash_strength; + + if (!encode_benefit()) + return 0; + + switch (rshash_state.state) { + case RSHASH_STILL: + switch (judge_rshash_direction()) { + case GO_UP: + if (rshash_state.pre_direct == GO_DOWN) + hash_strength_delta = 0; + + inc_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + rshash_state.stable_benefit = get_current_benefit(); + rshash_state.pre_direct = GO_UP; + break; + + case GO_DOWN: + if (rshash_state.pre_direct == GO_UP) + hash_strength_delta = 0; + + dec_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + rshash_state.stable_benefit = get_current_benefit(); + rshash_state.pre_direct = GO_DOWN; + break; + + case OBSCURE: + rshash_state.stable_point = hash_strength; + rshash_state.turn_point_down = hash_strength; + rshash_state.turn_point_up = hash_strength; + rshash_state.turn_benefit_down = get_current_benefit(); + rshash_state.turn_benefit_up = get_current_benefit(); + rshash_state.lookup_window_index = 0; + rshash_state.state = RSHASH_TRYDOWN; + dec_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + break; + + case STILL: + break; + default: + BUG(); + } + break; + + case RSHASH_TRYDOWN: + if (rshash_state.lookup_window_index++ % 5 == 0) + rshash_state.below_count = 0; + + if (get_current_benefit() < rshash_state.stable_benefit) + rshash_state.below_count++; + else if (get_current_benefit() > + rshash_state.turn_benefit_down) { + rshash_state.turn_point_down = hash_strength; + rshash_state.turn_benefit_down = get_current_benefit(); + } + + if (rshash_state.below_count >= 3 || + judge_rshash_direction() == GO_UP || + hash_strength == 1) { + hash_strength = rshash_state.stable_point; + hash_strength_delta = 0; + inc_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + rshash_state.lookup_window_index = 0; + rshash_state.state = RSHASH_TRYUP; + hash_strength_delta = 0; + } else { + dec_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + } + break; + + case RSHASH_TRYUP: + if (rshash_state.lookup_window_index++ % 5 == 0) + rshash_state.below_count = 0; + + if (get_current_benefit() < rshash_state.turn_benefit_down) + rshash_state.below_count++; + else if (get_current_benefit() > rshash_state.turn_benefit_up) { + rshash_state.turn_point_up = hash_strength; + rshash_state.turn_benefit_up = get_current_benefit(); + } + + if 
(rshash_state.below_count >= 3 || + judge_rshash_direction() == GO_DOWN || + hash_strength == HASH_STRENGTH_MAX) { + hash_strength = rshash_state.turn_benefit_up > + rshash_state.turn_benefit_down ? + rshash_state.turn_point_up : + rshash_state.turn_point_down; + + rshash_state.state = RSHASH_PRE_STILL; + } else { + inc_hash_strength(hash_strength_delta); + inc_hash_strength_delta(); + } + + break; + + case RSHASH_NEW: + case RSHASH_PRE_STILL: + rshash_state.stable_benefit = get_current_benefit(); + rshash_state.state = RSHASH_STILL; + hash_strength_delta = 0; + break; + default: + BUG(); + } + + /* rshash_neg = rshash_pos = 0; */ + reset_benefit(); + + if (prev_hash_strength != hash_strength) + stable_tree_delta_hash(prev_hash_strength); + + return prev_hash_strength != hash_strength; +} + +/** + * round_update_ladder() - The main function to do update of all the + * adjustments whenever a scan round is finished. + */ +static noinline void round_update_ladder(void) +{ + int i; + unsigned long dedup; + struct vma_slot *slot, *tmp_slot; + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; + } + + list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { + + /* slot may be rung_rm_slot() when mm exits */ + if (slot->snode) { + dedup = cal_dedup_ratio_old(slot); + if (dedup && dedup >= uksm_abundant_threshold) + vma_rung_up(slot); + } + + slot->pages_bemerged = 0; + slot->pages_cowed = 0; + + list_del_init(&slot->dedup_list); + } +} + +static void uksm_del_vma_slot(struct vma_slot *slot) +{ + int i, j; + struct rmap_list_entry *entry; + + if (slot->snode) { + /* + * In case it just failed when entering the rung, it's not + * necessary. + */ + rung_rm_slot(slot); + } + + if (!list_empty(&slot->dedup_list)) + list_del(&slot->dedup_list); + + if (!slot->rmap_list_pool || !slot->pool_counts) { + /* In case it OOMed in uksm_vma_enter() */ + goto out; + } + + for (i = 0; i < slot->pool_size; i++) { + void *addr; + + if (!slot->rmap_list_pool[i]) + continue; + + addr = kmap(slot->rmap_list_pool[i]); + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { + entry = (struct rmap_list_entry *)addr + j; + if (is_addr(entry->addr)) + continue; + if (!entry->item) + continue; + + remove_rmap_item_from_tree(entry->item); + free_rmap_item(entry->item); + slot->pool_counts[i]--; + } + BUG_ON(slot->pool_counts[i]); + kunmap(slot->rmap_list_pool[i]); + __free_page(slot->rmap_list_pool[i]); + } + kfree(slot->rmap_list_pool); + kfree(slot->pool_counts); + +out: + slot->rung = NULL; + if (slot->flags & UKSM_SLOT_IN_UKSM) { + BUG_ON(uksm_pages_total < slot->pages); + uksm_pages_total -= slot->pages; + } + + if (slot->fully_scanned_round == fully_scanned_round) + scanned_virtual_pages -= slot->pages; + else + scanned_virtual_pages -= slot->pages_scanned; + free_vma_slot(slot); +} + + +#define SPIN_LOCK_PERIOD 32 +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; +static inline void cleanup_vma_slots(void) +{ + struct vma_slot *slot; + int i; + + i = 0; + spin_lock(&vma_slot_list_lock); + while (!list_empty(&vma_slot_del)) { + slot = list_entry(vma_slot_del.next, + struct vma_slot, slot_list); + list_del(&slot->slot_list); + cleanup_slots[i++] = slot; + if (i == SPIN_LOCK_PERIOD) { + spin_unlock(&vma_slot_list_lock); + while (--i >= 0) + uksm_del_vma_slot(cleanup_slots[i]); + i = 0; + spin_lock(&vma_slot_list_lock); + } + } + spin_unlock(&vma_slot_list_lock); + + while (--i >= 0) + uksm_del_vma_slot(cleanup_slots[i]); +} + +/* +*expotional moving 
average formula +*/ +static inline unsigned long ema(unsigned long curr, unsigned long last_ema) +{ + /* + * For a very high burst, even the ema cannot work well, a false very + * high per-page time estimation can result in feedback in very high + * overhead of context swith and rung update -- this will then lead + * to higher per-paper time, this may not converge. + * + * Instead, we try to approach this value in a binary manner. + */ + if (curr > last_ema * 10) + return last_ema * 2; + + return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; +} + +/* + * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to + * nanoseconds based on current uksm_sleep_jiffies. + */ +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) +{ + return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / + (TIME_RATIO_SCALE - ratio) * ratio; +} + + +static inline unsigned long rung_real_ratio(int cpu_time_ratio) +{ + unsigned long ret; + + BUG_ON(!cpu_time_ratio); + + if (cpu_time_ratio > 0) + ret = cpu_time_ratio; + else + ret = (unsigned long)(-cpu_time_ratio) * + uksm_max_cpu_percentage / 100UL; + + return ret ? ret : 1; +} + +static noinline void uksm_calc_scan_pages(void) +{ + struct scan_rung *ladder = uksm_scan_ladder; + unsigned long sleep_usecs, nsecs; + unsigned long ratio; + int i; + unsigned long per_page; + + if (uksm_ema_page_time > 100000 || + (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) + uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; + + per_page = uksm_ema_page_time; + BUG_ON(!per_page); + + /* + * For every 8 eval round, we try to probe a uksm_sleep_jiffies value + * based on saved user input. + */ + if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) + uksm_sleep_jiffies = uksm_sleep_saved; + + /* We require a rung scan at least 1 page in a period. */ + nsecs = per_page; + ratio = rung_real_ratio(ladder[0].cpu_ratio); + if (cpu_ratio_to_nsec(ratio) < nsecs) { + sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio + / NSEC_PER_USEC; + uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; + } + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + ratio = rung_real_ratio(ladder[i].cpu_ratio); + ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / + per_page; + BUG_ON(!ladder[i].pages_to_scan); + uksm_calc_rung_step(&ladder[i], per_page, ratio); + } +} + +/* + * From the scan time of this round (ns) to next expected min sleep time + * (ms), be careful of the possible overflows. 
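+ * For example, with ratio r out of TIME_RATIO_SCALE, sleeping for
+ * scan_time * (TIME_RATIO_SCALE - r) / r after a scan keeps ksmd's CPU
+ * share close to r / TIME_RATIO_SCALE; the scan_time >> 20 below is a
+ * cheap nanosecond-to-millisecond conversion (2^20 ~= 10^6).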
ratio is taken from + * rung_real_ratio() + */ +static inline +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) +{ + scan_time >>= 20; /* to msec level now */ + BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); + + return (unsigned int) ((unsigned long) scan_time * + (TIME_RATIO_SCALE - ratio) / ratio); +} + +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) + +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) +{ + struct scan_rung *rung; + + rung = &uksm_scan_ladder[0]; + rung_add_new_slots(rung, slots, num); +} + +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; + +static void uksm_enter_all_slots(void) +{ + struct vma_slot *slot; + unsigned long index; + struct list_head empty_vma_list; + int i; + + i = 0; + index = 0; + INIT_LIST_HEAD(&empty_vma_list); + + spin_lock(&vma_slot_list_lock); + while (!list_empty(&vma_slot_new)) { + slot = list_entry(vma_slot_new.next, + struct vma_slot, slot_list); + + if (!slot->vma->anon_vma) { + list_move(&slot->slot_list, &empty_vma_list); + } else if (vma_can_enter(slot->vma)) { + batch_slots[index++] = slot; + list_del_init(&slot->slot_list); + } else { + list_move(&slot->slot_list, &vma_slot_noadd); + } + + if (++i == SPIN_LOCK_PERIOD || + (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { + spin_unlock(&vma_slot_list_lock); + + if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { + uksm_vma_enter(batch_slots, index); + index = 0; + } + i = 0; + cond_resched(); + spin_lock(&vma_slot_list_lock); + } + } + + list_splice(&empty_vma_list, &vma_slot_new); + + spin_unlock(&vma_slot_list_lock); + + if (index) + uksm_vma_enter(batch_slots, index); + +} + +static inline int rung_round_finished(struct scan_rung *rung) +{ + return rung->flags & UKSM_RUNG_ROUND_FINISHED; +} + +static inline void judge_slot(struct vma_slot *slot) +{ + struct scan_rung *rung = slot->rung; + unsigned long dedup; + int deleted; + + dedup = cal_dedup_ratio(slot); + if (vma_fully_scanned(slot) && uksm_thrash_threshold) + deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); + else if (dedup && dedup >= uksm_abundant_threshold) + deleted = vma_rung_up(slot); + else + deleted = vma_rung_down(slot); + + slot->pages_merged = 0; + slot->pages_cowed = 0; + + if (vma_fully_scanned(slot)) + slot->pages_scanned = 0; + + slot->last_scanned = slot->pages_scanned; + + /* If its deleted in above, then rung was already advanced. */ + if (!deleted) + advance_current_scan(rung); +} + + +static inline int hash_round_finished(void) +{ + if (scanned_virtual_pages > (uksm_pages_total >> 2)) { + scanned_virtual_pages = 0; + if (uksm_pages_scanned) + fully_scanned_round++; + + return 1; + } else { + return 0; + } +} + +#define UKSM_MMSEM_BATCH 5 +#define BUSY_RETRY 100 + +/** + * uksm_do_scan() - the main worker function. 
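+ *
+ * It walks every rung of the scan ladder, consuming each rung's
+ * pages_to_scan quota via scan_vma_one_page() with batched mmap_sem
+ * acquisition, then updates the per-page cost EMA and derives the
+ * sleep interval for the next round.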
+ */ +static noinline void uksm_do_scan(void) +{ + struct vma_slot *slot, *iter; + struct mm_struct *busy_mm; + unsigned char round_finished, all_rungs_emtpy; + int i, err, mmsem_batch; + unsigned long pcost; + long long delta_exec; + unsigned long vpages, max_cpu_ratio; + unsigned long long start_time, end_time, scan_time; + unsigned int expected_jiffies; + + might_sleep(); + + vpages = 0; + + start_time = task_sched_runtime(current); + max_cpu_ratio = 0; + mmsem_batch = 0; + + for (i = 0; i < SCAN_LADDER_SIZE;) { + struct scan_rung *rung = &uksm_scan_ladder[i]; + unsigned long ratio; + int busy_retry; + + if (!rung->pages_to_scan) { + i++; + continue; + } + + if (!rung->vma_root.num) { + rung->pages_to_scan = 0; + i++; + continue; + } + + ratio = rung_real_ratio(rung->cpu_ratio); + if (ratio > max_cpu_ratio) + max_cpu_ratio = ratio; + + busy_retry = BUSY_RETRY; + /* + * Do not consider rung_round_finished() here, just used up the + * rung->pages_to_scan quota. + */ + while (rung->pages_to_scan && rung->vma_root.num && + likely(!freezing(current))) { + int reset = 0; + + slot = rung->current_scan; + + BUG_ON(vma_fully_scanned(slot)); + + if (mmsem_batch) { + err = 0; + } else { + err = try_down_read_slot_mmap_sem(slot); + } + + if (err == -ENOENT) { +rm_slot: + rung_rm_slot(slot); + continue; + } + + busy_mm = slot->mm; + + if (err == -EBUSY) { + /* skip other vmas on the same mm */ + do { + reset = advance_current_scan(rung); + iter = rung->current_scan; + busy_retry--; + if (iter->vma->vm_mm != busy_mm || + !busy_retry || reset) + break; + } while (1); + + if (iter->vma->vm_mm != busy_mm) { + continue; + } else { + /* scan round finsished */ + break; + } + } + + BUG_ON(!vma_can_enter(slot->vma)); + if (uksm_test_exit(slot->vma->vm_mm)) { + mmsem_batch = 0; + up_read(&slot->vma->vm_mm->mmap_sem); + goto rm_slot; + } + + if (mmsem_batch) + mmsem_batch--; + else + mmsem_batch = UKSM_MMSEM_BATCH; + + /* Ok, we have take the mmap_sem, ready to scan */ + scan_vma_one_page(slot); + rung->pages_to_scan--; + vpages++; + + if (rung->current_offset + rung->step > slot->pages - 1 + || vma_fully_scanned(slot)) { + up_read(&slot->vma->vm_mm->mmap_sem); + judge_slot(slot); + mmsem_batch = 0; + } else { + rung->current_offset += rung->step; + if (!mmsem_batch) + up_read(&slot->vma->vm_mm->mmap_sem); + } + + busy_retry = BUSY_RETRY; + cond_resched(); + } + + if (mmsem_batch) { + up_read(&slot->vma->vm_mm->mmap_sem); + mmsem_batch = 0; + } + + if (freezing(current)) + break; + + cond_resched(); + } + end_time = task_sched_runtime(current); + delta_exec = end_time - start_time; + + if (freezing(current)) + return; + + cleanup_vma_slots(); + uksm_enter_all_slots(); + + round_finished = 1; + all_rungs_emtpy = 1; + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + struct scan_rung *rung = &uksm_scan_ladder[i]; + + if (rung->vma_root.num) { + all_rungs_emtpy = 0; + if (!rung_round_finished(rung)) + round_finished = 0; + } + } + + if (all_rungs_emtpy) + round_finished = 0; + + if (round_finished) { + round_update_ladder(); + uksm_eval_round++; + + if (hash_round_finished() && rshash_adjust()) { + /* Reset the unstable root iff hash strength changed */ + uksm_hash_round++; + root_unstable_tree = RB_ROOT; + free_all_tree_nodes(&unstable_tree_node_list); + } + + /* + * A number of pages can hang around indefinitely on per-cpu + * pagevecs, raised page count preventing write_protect_page + * from merging them. 
Though it doesn't really matter much, + * it is puzzling to see some stuck in pages_volatile until + * other activity jostles them out, and they also prevented + * LTP's KSM test from succeeding deterministically; so drain + * them here (here rather than on entry to uksm_do_scan(), + * so we don't IPI too often when pages_to_scan is set low). + */ + lru_add_drain_all(); + } + + + if (vpages && delta_exec > 0) { + pcost = (unsigned long) delta_exec / vpages; + if (likely(uksm_ema_page_time)) + uksm_ema_page_time = ema(pcost, uksm_ema_page_time); + else + uksm_ema_page_time = pcost; + } + + uksm_calc_scan_pages(); + uksm_sleep_real = uksm_sleep_jiffies; + /* in case of radical cpu bursts, apply the upper bound */ + end_time = task_sched_runtime(current); + if (max_cpu_ratio && end_time > start_time) { + scan_time = end_time - start_time; + expected_jiffies = msecs_to_jiffies( + scan_time_to_sleep(scan_time, max_cpu_ratio)); + + if (expected_jiffies > uksm_sleep_real) + uksm_sleep_real = expected_jiffies; + + /* We have a 1 second up bound for responsiveness. */ + if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) + uksm_sleep_real = msecs_to_jiffies(1000); + } + + return; +} + +static int ksmd_should_run(void) +{ + return uksm_run & UKSM_RUN_MERGE; +} + +static int uksm_scan_thread(void *nothing) +{ + set_freezable(); + set_user_nice(current, 5); + + while (!kthread_should_stop()) { + mutex_lock(&uksm_thread_mutex); + if (ksmd_should_run()) { + uksm_do_scan(); + } + mutex_unlock(&uksm_thread_mutex); + + try_to_freeze(); + + if (ksmd_should_run()) { + schedule_timeout_interruptible(uksm_sleep_real); + uksm_sleep_times++; + } else { + wait_event_freezable(uksm_thread_wait, + ksmd_should_run() || kthread_should_stop()); + } + } + return 0; +} + +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) +{ + struct stable_node *stable_node; + struct node_vma *node_vma; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + unsigned long address; + + VM_BUG_ON_PAGE(!PageKsm(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + + stable_node = page_stable_node(page); + if (!stable_node) + return ret; +again: + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + anon_vma_lock_read(anon_vma); + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, + 0, ULONG_MAX) { + vma = vmac->vma; + address = get_rmap_addr(rmap_item); + + if (address < vma->vm_start || + address >= vma->vm_end) + continue; + + if ((rmap_item->slot->vma == vma) == + search_new_forks) + continue; + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + + ret = rwc->rmap_one(page, vma, address, rwc->arg); + if (ret != SWAP_AGAIN) { + anon_vma_unlock_read(anon_vma); + goto out; + } + + if (rwc->done && rwc->done(page)) { + anon_vma_unlock_read(anon_vma); + goto out; + } + } + anon_vma_unlock_read(anon_vma); + } + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +#ifdef CONFIG_MIGRATION +/* Common ksm interface but may be specific to uksm */ +void ksm_migrate_page(struct page *newpage, struct page *oldpage) +{ + struct stable_node *stable_node; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON(newpage->mapping != oldpage->mapping); + + stable_node = page_stable_node(newpage); + if (stable_node) { + 
VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); + stable_node->kpfn = page_to_pfn(newpage); + } +} +#endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_MEMORY_HOTREMOVE +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, + unsigned long end_pfn) +{ + struct rb_node *node; + + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { + struct stable_node *stable_node; + + stable_node = rb_entry(node, struct stable_node, node); + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) + return stable_node; + } + return NULL; +} + +static int uksm_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + struct stable_node *stable_node; + + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Keep it very simple for now: just lock out ksmd and + * MADV_UNMERGEABLE while any memory is going offline. + * mutex_lock_nested() is necessary because lockdep was alarmed + * that here we take uksm_thread_mutex inside notifier chain + * mutex, and later take notifier chain mutex inside + * uksm_thread_mutex to unlock it. But that's safe because both + * are inside mem_hotplug_mutex. + */ + mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); + break; + + case MEM_OFFLINE: + /* + * Most of the work is done by page migration; but there might + * be a few stable_nodes left over, still pointing to struct + * pages which have been offlined: prune those from the tree. + */ + while ((stable_node = uksm_check_stable_tree(mn->start_pfn, + mn->start_pfn + mn->nr_pages)) != NULL) + remove_node_from_stable_tree(stable_node, 1, 1); + /* fallthrough */ + + case MEM_CANCEL_OFFLINE: + mutex_unlock(&uksm_thread_mutex); + break; + } + return NOTIFY_OK; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +#ifdef CONFIG_SYSFS +/* + * This all compiles without CONFIG_SYSFS, but is a waste of space. 
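+ *
+ * All tunables and statistics below are exposed through the "uksm"
+ * attribute group, i.e. under /sys/kernel/mm/uksm/.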
+ */ + +#define UKSM_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define UKSM_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t max_cpu_percentage_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", uksm_max_cpu_percentage); +} + +static ssize_t max_cpu_percentage_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long max_cpu_percentage; + int err; + + err = kstrtoul(buf, 10, &max_cpu_percentage); + if (err || max_cpu_percentage > 100) + return -EINVAL; + + if (max_cpu_percentage == 100) + max_cpu_percentage = 99; + else if (max_cpu_percentage < 10) + max_cpu_percentage = 10; + + uksm_max_cpu_percentage = max_cpu_percentage; + + return count; +} +UKSM_ATTR(max_cpu_percentage); + +static ssize_t sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); +} + +static ssize_t sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > MSEC_PER_SEC) + return -EINVAL; + + uksm_sleep_jiffies = msecs_to_jiffies(msecs); + uksm_sleep_saved = uksm_sleep_jiffies; + + return count; +} +UKSM_ATTR(sleep_millisecs); + + +static ssize_t cpu_governor_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); + int i; + + buf[0] = '\0'; + for (i = 0; i < n ; i++) { + if (uksm_cpu_governor == i) + strcat(buf, "["); + + strcat(buf, uksm_cpu_governor_str[i]); + + if (uksm_cpu_governor == i) + strcat(buf, "]"); + + strcat(buf, " "); + } + strcat(buf, "\n"); + + return strlen(buf); +} + +static inline void init_performance_values(void) +{ + int i; + struct scan_rung *rung; + struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; + + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + rung = uksm_scan_ladder + i; + rung->cpu_ratio = preset->cpu_ratio[i]; + rung->cover_msecs = preset->cover_msecs[i]; + } + + uksm_max_cpu_percentage = preset->max_cpu; +} + +static ssize_t cpu_governor_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); + + for (n--; n >=0 ; n--) { + if (!strncmp(buf, uksm_cpu_governor_str[n], + strlen(uksm_cpu_governor_str[n]))) + break; + } + + if (n < 0) + return -EINVAL; + else + uksm_cpu_governor = n; + + init_performance_values(); + + return count; +} +UKSM_ATTR(cpu_governor); + +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", uksm_run); +} + +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long flags; + + err = kstrtoul(buf, 10, &flags); + if (err || flags > UINT_MAX) + return -EINVAL; + if (flags > UKSM_RUN_MERGE) + return -EINVAL; + + mutex_lock(&uksm_thread_mutex); + if (uksm_run != flags) { + uksm_run = flags; + } + mutex_unlock(&uksm_thread_mutex); + + if (flags & UKSM_RUN_MERGE) + wake_up_interruptible(&uksm_thread_wait); + + return count; +} +UKSM_ATTR(run); + +static ssize_t abundant_threshold_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", 
uksm_abundant_threshold); +} + +static ssize_t abundant_threshold_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long flags; + + err = kstrtoul(buf, 10, &flags); + if (err || flags > 99) + return -EINVAL; + + uksm_abundant_threshold = flags; + + return count; +} +UKSM_ATTR(abundant_threshold); + +static ssize_t thrash_threshold_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", uksm_thrash_threshold); +} + +static ssize_t thrash_threshold_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long flags; + + err = kstrtoul(buf, 10, &flags); + if (err || flags > 99) + return -EINVAL; + + uksm_thrash_threshold = flags; + + return count; +} +UKSM_ATTR(thrash_threshold); + +static ssize_t cpu_ratios_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, size; + struct scan_rung *rung; + char *p = buf; + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + rung = &uksm_scan_ladder[i]; + + if (rung->cpu_ratio > 0) + size = sprintf(p, "%d ", rung->cpu_ratio); + else + size = sprintf(p, "MAX/%d ", + TIME_RATIO_SCALE / -rung->cpu_ratio); + + p += size; + } + + *p++ = '\n'; + *p = '\0'; + + return p - buf; +} + +static ssize_t cpu_ratios_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int i, cpuratios[SCAN_LADDER_SIZE], err; + unsigned long value; + struct scan_rung *rung; + char *p, *end = NULL; + + p = kzalloc(count, GFP_KERNEL); + if (!p) + return -ENOMEM; + + memcpy(p, buf, count); + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + if (i != SCAN_LADDER_SIZE -1) { + end = strchr(p, ' '); + if (!end) + return -EINVAL; + + *end = '\0'; + } + + if (strstr(p, "MAX/")) { + p = strchr(p, '/') + 1; + err = kstrtoul(p, 10, &value); + if (err || value > TIME_RATIO_SCALE || !value) + return -EINVAL; + + cpuratios[i] = - (int) (TIME_RATIO_SCALE / value); + } else { + err = kstrtoul(p, 10, &value); + if (err || value > TIME_RATIO_SCALE || !value) + return -EINVAL; + + cpuratios[i] = value; + } + + p = end + 1; + } + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + rung = &uksm_scan_ladder[i]; + + rung->cpu_ratio = cpuratios[i]; + } + + return count; +} +UKSM_ATTR(cpu_ratios); + +static ssize_t eval_intervals_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, size; + struct scan_rung *rung; + char *p = buf; + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + rung = &uksm_scan_ladder[i]; + size = sprintf(p, "%u ", rung->cover_msecs); + p += size; + } + + *p++ = '\n'; + *p = '\0'; + + return p - buf; +} + +static ssize_t eval_intervals_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int i, err; + unsigned long values[SCAN_LADDER_SIZE]; + struct scan_rung *rung; + char *p, *end = NULL; + + p = kzalloc(count, GFP_KERNEL); + if (!p) + return -ENOMEM; + + memcpy(p, buf, count); + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + if (i != SCAN_LADDER_SIZE -1) { + end = strchr(p, ' '); + if (!end) + return -EINVAL; + + *end = '\0'; + } + + err = kstrtoul(p, 10, &values[i]); + if (err) + return -EINVAL; + + p = end + 1; + } + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + rung = &uksm_scan_ladder[i]; + + rung->cover_msecs = values[i]; + } + + return count; +} +UKSM_ATTR(eval_intervals); + +static ssize_t ema_per_page_time_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, 
"%lu\n", uksm_ema_page_time); +} +UKSM_ATTR_RO(ema_per_page_time); + +static ssize_t pages_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", uksm_pages_shared); +} +UKSM_ATTR_RO(pages_shared); + +static ssize_t pages_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", uksm_pages_sharing); +} +UKSM_ATTR_RO(pages_sharing); + +static ssize_t pages_unshared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", uksm_pages_unshared); +} +UKSM_ATTR_RO(pages_unshared); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", fully_scanned_round); +} +UKSM_ATTR_RO(full_scans); + +static ssize_t pages_scanned_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned long base = 0; + u64 delta, ret; + + if (pages_scanned_stored) { + base = pages_scanned_base; + ret = pages_scanned_stored; + delta = uksm_pages_scanned >> base; + if (CAN_OVERFLOW_U64(ret, delta)) { + ret >>= 1; + delta >>= 1; + base++; + ret += delta; + } + } else { + ret = uksm_pages_scanned; + } + + while (ret > ULONG_MAX) { + ret >>= 1; + base++; + } + + if (base) + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); + else + return sprintf(buf, "%lu\n", (unsigned long)ret); +} +UKSM_ATTR_RO(pages_scanned); + +static ssize_t hash_strength_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", hash_strength); +} +UKSM_ATTR_RO(hash_strength); + +static ssize_t sleep_times_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", uksm_sleep_times); +} +UKSM_ATTR_RO(sleep_times); + + +static struct attribute *uksm_attrs[] = { + &max_cpu_percentage_attr.attr, + &sleep_millisecs_attr.attr, + &cpu_governor_attr.attr, + &run_attr.attr, + &ema_per_page_time_attr.attr, + &pages_shared_attr.attr, + &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &full_scans_attr.attr, + &pages_scanned_attr.attr, + &hash_strength_attr.attr, + &sleep_times_attr.attr, + &thrash_threshold_attr.attr, + &abundant_threshold_attr.attr, + &cpu_ratios_attr.attr, + &eval_intervals_attr.attr, + NULL, +}; + +static struct attribute_group uksm_attr_group = { + .attrs = uksm_attrs, + .name = "uksm", +}; +#endif /* CONFIG_SYSFS */ + +static inline void init_scan_ladder(void) +{ + int i; + struct scan_rung *rung; + + for (i = 0; i < SCAN_LADDER_SIZE; i++) { + rung = uksm_scan_ladder + i; + slot_tree_init_root(&rung->vma_root); + } + + init_performance_values(); + uksm_calc_scan_pages(); +} + +static inline int cal_positive_negative_costs(void) +{ + struct page *p1, *p2; + unsigned char *addr1, *addr2; + unsigned long i, time_start, hash_cost; + unsigned long loopnum = 0; + + /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. 
*/ + volatile u32 hash; + volatile int ret; + + p1 = alloc_page(GFP_KERNEL); + if (!p1) + return -ENOMEM; + + p2 = alloc_page(GFP_KERNEL); + if (!p2) + return -ENOMEM; + + addr1 = kmap_atomic(p1); + addr2 = kmap_atomic(p2); + memset(addr1, prandom_u32(), PAGE_SIZE); + memcpy(addr2, addr1, PAGE_SIZE); + + /* make sure that the two pages differ in last byte */ + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; + kunmap_atomic(addr2); + kunmap_atomic(addr1); + + time_start = jiffies; + while (jiffies - time_start < 100) { + for (i = 0; i < 100; i++) + hash = page_hash(p1, HASH_STRENGTH_FULL, 0); + loopnum += 100; + } + hash_cost = (jiffies - time_start); + + time_start = jiffies; + for (i = 0; i < loopnum; i++) + ret = pages_identical(p1, p2); + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); + memcmp_cost /= hash_cost; + printk(KERN_INFO "UKSM: relative memcmp_cost = %lu " + "hash=%u cmp_ret=%d.\n", + memcmp_cost, hash, ret); + + __free_page(p1); + __free_page(p2); + return 0; +} + +static int init_zeropage_hash_table(void) +{ + struct page *page; + char *addr; + int i; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + addr = kmap_atomic(page); + memset(addr, 0, PAGE_SIZE); + kunmap_atomic(addr); + + zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32), + GFP_KERNEL); + if (!zero_hash_table) + return -ENOMEM; + + for (i = 0; i < HASH_STRENGTH_MAX; i++) + zero_hash_table[i] = page_hash(page, i, 0); + + __free_page(page); + + return 0; +} + +static inline int init_random_sampling(void) +{ + unsigned long i; + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!random_nums) + return -ENOMEM; + + for (i = 0; i < HASH_STRENGTH_FULL; i++) + random_nums[i] = i; + + for (i = 0; i < HASH_STRENGTH_FULL; i++) { + unsigned long rand_range, swap_index, tmp; + + rand_range = HASH_STRENGTH_FULL - i; + swap_index = i + prandom_u32() % rand_range; + tmp = random_nums[i]; + random_nums[i] = random_nums[swap_index]; + random_nums[swap_index] = tmp; + } + + rshash_state.state = RSHASH_NEW; + rshash_state.below_count = 0; + rshash_state.lookup_window_index = 0; + + return cal_positive_negative_costs(); +} + +static int __init uksm_slab_init(void) +{ + rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); + if (!rmap_item_cache) + goto out; + + stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); + if (!stable_node_cache) + goto out_free1; + + node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); + if (!node_vma_cache) + goto out_free2; + + vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); + if (!vma_slot_cache) + goto out_free3; + + tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); + if (!tree_node_cache) + goto out_free4; + + return 0; + +out_free4: + kmem_cache_destroy(vma_slot_cache); +out_free3: + kmem_cache_destroy(node_vma_cache); +out_free2: + kmem_cache_destroy(stable_node_cache); +out_free1: + kmem_cache_destroy(rmap_item_cache); +out: + return -ENOMEM; +} + +static void __init uksm_slab_free(void) +{ + kmem_cache_destroy(stable_node_cache); + kmem_cache_destroy(rmap_item_cache); + kmem_cache_destroy(node_vma_cache); + kmem_cache_destroy(vma_slot_cache); + kmem_cache_destroy(tree_node_cache); +} + +/* Common interface to ksm, different to it. 
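+ * MADV_MERGEABLE is deliberately ignored here, since UKSM scans all
+ * eligible VMAs on its own; only MADV_UNMERGEABLE needs real work.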
*/ +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags) +{ + int err; + + switch (advice) { + case MADV_MERGEABLE: + return 0; /* just ignore the advice */ + + case MADV_UNMERGEABLE: + if (!(*vm_flags & VM_MERGEABLE)) + return 0; /* just ignore the advice */ + + if (vma->anon_vma) { + err = unmerge_uksm_pages(vma, start, end); + if (err) + return err; + } + + uksm_remove_vma(vma); + *vm_flags &= ~VM_MERGEABLE; + break; + } + + return 0; +} + +/* Common interface to ksm, actually the same. */ +struct page *ksm_might_need_to_copy(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = page_anon_vma(page); + struct page *new_page; + + if (PageKsm(page)) { + if (page_stable_node(page)) + return page; /* no need to copy it */ + } else if (!anon_vma) { + return page; /* no need to copy it */ + } else if (anon_vma->root == vma->anon_vma->root && + page->index == linear_page_index(vma, address)) { + return page; /* still no need to copy it */ + } + if (!PageUptodate(page)) + return page; /* let do_swap_page report the error */ + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (new_page) { + copy_user_highpage(new_page, page, address, vma); + + SetPageDirty(new_page); + __SetPageUptodate(new_page); + __set_page_locked(new_page); + } + + return new_page; +} + +static int __init uksm_init(void) +{ + struct task_struct *uksm_thread; + int err; + + uksm_sleep_jiffies = msecs_to_jiffies(100); + uksm_sleep_saved = uksm_sleep_jiffies; + + slot_tree_init(); + init_scan_ladder(); + + + err = init_random_sampling(); + if (err) + goto out_free2; + + err = uksm_slab_init(); + if (err) + goto out_free1; + + err = init_zeropage_hash_table(); + if (err) + goto out_free0; + + uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); + if (IS_ERR(uksm_thread)) { + printk(KERN_ERR "uksm: creating kthread failed\n"); + err = PTR_ERR(uksm_thread); + goto out_free; + } + +#ifdef CONFIG_SYSFS + err = sysfs_create_group(mm_kobj, &uksm_attr_group); + if (err) { + printk(KERN_ERR "uksm: register sysfs failed\n"); + kthread_stop(uksm_thread); + goto out_free; + } +#else + uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ + +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MEMORY_HOTREMOVE + /* + * Choose a high priority since the callback takes uksm_thread_mutex: + * later callbacks could only be taking locks which nest within that. 
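+	 * The priority of 100 used below places this callback ahead of
+	 * default-priority (0) notifiers in the chain.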
+ */ + hotplug_memory_notifier(uksm_memory_callback, 100); +#endif + return 0; + +out_free: + kfree(zero_hash_table); +out_free0: + uksm_slab_free(); +out_free1: + kfree(random_nums); +out_free2: + kfree(uksm_scan_ladder); + return err; +} + +#ifdef MODULE +subsys_initcall(ksm_init); +#else +late_initcall(uksm_init); +#endif + diff -Nur linux-4.2/mm/vmscan.c linux-4.2-pck/mm/vmscan.c --- linux-4.2/mm/vmscan.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/vmscan.c 2015-09-08 11:20:32.560340751 -0300 @@ -1446,7 +1446,7 @@ { unsigned long inactive, isolated; - if (current_is_kswapd()) + if (current_is_kswapd() || sc->hibernation_mode) return 0; if (!sane_reclaim(sc)) @@ -2295,6 +2295,9 @@ unsigned long pages_for_compaction; unsigned long inactive_lru_pages; + if (nr_reclaimed && nr_scanned && sc->nr_to_reclaim >= sc->nr_reclaimed) + return true; + /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) return false; @@ -2605,6 +2608,12 @@ unsigned long total_scanned = 0; unsigned long writeback_threshold; bool zones_reclaimable; + +#ifdef CONFIG_FREEZER + if (unlikely(pm_freezing && !sc->hibernation_mode)) + return 0; +#endif + retry: delayacct_freepages_start(); @@ -3488,6 +3497,11 @@ if (!populated_zone(zone)) return; +#ifdef CONFIG_FREEZER + if (pm_freezing) + return; +#endif + if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; @@ -3513,7 +3527,7 @@ * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ -unsigned long shrink_all_memory(unsigned long nr_to_reclaim) +unsigned long shrink_memory_mask(unsigned long nr_to_reclaim, gfp_t mask) { struct reclaim_state reclaim_state; struct scan_control sc = { @@ -3542,6 +3556,11 @@ return nr_reclaimed; } + +unsigned long shrink_all_memory(unsigned long nr_to_reclaim) +{ + return shrink_memory_mask(nr_to_reclaim, GFP_HIGHUSER_MOVABLE); +} #endif /* CONFIG_HIBERNATION */ /* It's optimal to keep kswapds on the same CPUs as their memory, but diff -Nur linux-4.2/mm/vmstat.c linux-4.2-pck/mm/vmstat.c --- linux-4.2/mm/vmstat.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/mm/vmstat.c 2015-09-08 00:51:03.470667662 -0300 @@ -740,6 +740,9 @@ "nr_anon_transparent_hugepages", "nr_free_cma", +#ifdef CONFIG_UKSM + "nr_uksm_zero_pages", +#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", diff -Nur linux-4.2/net/bridge/br_multicast.c linux-4.2-pck/net/bridge/br_multicast.c --- linux-4.2/net/bridge/br_multicast.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/bridge/br_multicast.c 2015-09-15 00:15:03.224976622 -0300 @@ -991,7 +991,7 @@ ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); - len = sizeof(*ih); + len = skb_transport_offset(skb) + sizeof(*ih); for (i = 0; i < num; i++) { len += sizeof(*grec); @@ -1052,7 +1052,7 @@ icmp6h = icmp6_hdr(skb); num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); - len = sizeof(*icmp6h); + len = skb_transport_offset(skb) + sizeof(*icmp6h); for (i = 0; i < num; i++) { __be16 *nsrcs, _nsrcs; diff -Nur linux-4.2/net/core/secure_seq.c linux-4.2-pck/net/core/secure_seq.c --- linux-4.2/net/core/secure_seq.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/core/secure_seq.c 2015-09-08 00:48:31.501257347 -0300 @@ -8,7 +8,11 @@ #include #include #include +#include +#include +#include +#include #include #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) @@ -39,6 +43,102 @@ } #endif +#ifdef CONFIG_TCP_STEALTH +u32 
tcp_stealth_sequence_number(struct sock *sk, __be32 *daddr, + u32 daddr_size, __be16 dport) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_key *md5; + + __u32 sec[MD5_MESSAGE_BYTES / sizeof(__u32)]; + __u32 i; + __u32 tsval = 0; + + __be32 iv[MD5_DIGEST_WORDS] = { 0 }; + __be32 isn; + + memcpy(iv, daddr, (daddr_size > sizeof(iv)) ? sizeof(iv) : daddr_size); + +#ifdef CONFIG_TCP_MD5SIG + md5 = tp->af_specific->md5_lookup(sk, sk); +#else + md5 = NULL; +#endif + if (likely(sysctl_tcp_timestamps && !md5) || tp->stealth.saw_tsval) + tsval = tp->stealth.mstamp.stamp_jiffies; + + ((__be16 *)iv)[2] ^= cpu_to_be16(tp->stealth.integrity_hash); + iv[2] ^= cpu_to_be32(tsval); + ((__be16 *)iv)[6] ^= dport; + + for (i = 0; i < MD5_DIGEST_WORDS; i++) + iv[i] = le32_to_cpu(iv[i]); + for (i = 0; i < MD5_MESSAGE_BYTES / sizeof(__le32); i++) + sec[i] = le32_to_cpu(((__le32 *)tp->stealth.secret)[i]); + + md5_transform(iv, sec); + + isn = cpu_to_be32(iv[0]) ^ cpu_to_be32(iv[1]) ^ + cpu_to_be32(iv[2]) ^ cpu_to_be32(iv[3]); + + if (tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY) + be32_isn_to_be16_ih(isn) = + cpu_to_be16(tp->stealth.integrity_hash); + + return be32_to_cpu(isn); +} +EXPORT_SYMBOL(tcp_stealth_sequence_number); + +u32 tcp_stealth_do_auth(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcphdr *th = tcp_hdr(skb); + __be32 isn = th->seq; + __be32 hash; + __be32 *daddr; + u32 daddr_size; + + tp->stealth.saw_tsval = + tcp_parse_tsval_option(&tp->stealth.mstamp.stamp_jiffies, th); + + if (tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) + tp->stealth.integrity_hash = + be16_to_cpu(be32_isn_to_be16_ih(isn)); + + switch (tp->inet_conn.icsk_inet.sk.sk_family) { +#if IS_ENABLED(CONFIG_IPV6) + case PF_INET6: + daddr_size = sizeof(ipv6_hdr(skb)->daddr.s6_addr32); + daddr = ipv6_hdr(skb)->daddr.s6_addr32; + break; +#endif + case PF_INET: + daddr_size = sizeof(ip_hdr(skb)->daddr); + daddr = &ip_hdr(skb)->daddr; + break; + default: + pr_err("TCP Stealth: Unknown network layer protocol, stop!\n"); + return 1; + } + + hash = tcp_stealth_sequence_number(sk, daddr, daddr_size, th->dest); + cpu_to_be32s(&hash); + + if (tp->stealth.mode & TCP_STEALTH_MODE_AUTH && + tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN && + be32_isn_to_be16_av(isn) == be32_isn_to_be16_av(hash)) + return 0; + + if (tp->stealth.mode & TCP_STEALTH_MODE_AUTH && + !(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) && + isn == hash) + return 0; + + return 1; +} +EXPORT_SYMBOL(tcp_stealth_do_auth); +#endif + #if IS_ENABLED(CONFIG_IPV6) __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport) diff -Nur linux-4.2/net/ethernet/eth.c linux-4.2-pck/net/ethernet/eth.c --- linux-4.2/net/ethernet/eth.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ethernet/eth.c 2015-09-08 00:48:22.241696581 -0300 @@ -355,7 +355,7 @@ dev->hard_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->addr_len = ETH_ALEN; - dev->tx_queue_len = 1000; /* Ethernet wants good queues */ + dev->tx_queue_len = 50; /* Ethernet wants good latency. Use FreeBSD defaults. 
*/ dev->flags = IFF_BROADCAST|IFF_MULTICAST; dev->priv_flags |= IFF_TX_SKB_SHARING; diff -Nur linux-4.2/net/ipv4/Kconfig linux-4.2-pck/net/ipv4/Kconfig --- linux-4.2/net/ipv4/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ipv4/Kconfig 2015-09-08 00:48:31.501257347 -0300 @@ -653,6 +653,9 @@ config DEFAULT_VEGAS bool "Vegas" if TCP_CONG_VEGAS=y + config DEFAULT_YEAH + bool "YeAH" if TCP_CONG_YEAH=y + config DEFAULT_VENO bool "Veno" if TCP_CONG_VENO=y @@ -683,6 +686,7 @@ default "htcp" if DEFAULT_HTCP default "hybla" if DEFAULT_HYBLA default "vegas" if DEFAULT_VEGAS + default "yeah" if DEFAULT_YEAH default "westwood" if DEFAULT_WESTWOOD default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO @@ -700,3 +704,13 @@ on the Internet. If unsure, say N. + +config TCP_STEALTH + bool "TCP: Stealth TCP socket support" + default n + ---help--- + This option enables support for stealth TCP sockets. If you do not + know what this means, you do not need it. + + If unsure, say N. + diff -Nur linux-4.2/net/ipv4/tcp.c linux-4.2-pck/net/ipv4/tcp.c --- linux-4.2/net/ipv4/tcp.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ipv4/tcp.c 2015-09-08 14:26:11.173759241 -0300 @@ -269,6 +269,7 @@ #include #include #include +#include #include #include @@ -2313,6 +2314,43 @@ return 0; } +#ifdef CONFIG_TCP_STEALTH +int tcp_stealth_integrity(__be16 *hash, u8 *secret, u8 *payload, int len) +{ + struct scatterlist sg[2]; + struct crypto_hash *tfm; + struct hash_desc desc; + __be16 h[MD5_DIGEST_WORDS * 2]; + int i; + int err = 0; + + tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + err = -PTR_ERR(tfm); + goto out; + } + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(sg, 2); + sg_set_buf(&sg[0], secret, MD5_MESSAGE_BYTES); + sg_set_buf(&sg[1], payload, len); + + if (crypto_hash_digest(&desc, sg, MD5_MESSAGE_BYTES + len, (u8 *)h)) { + err = -EFAULT; + goto out; + } + + *hash = be16_to_cpu(h[0]); + for (i = 1; i < MD5_DIGEST_WORDS * 2; i++) + *hash ^= be16_to_cpu(h[i]); + +out: + crypto_free_hash(tfm); + return err; +} +#endif + /* * Socket option code for TCP. 
*/ @@ -2343,6 +2381,66 @@ release_sock(sk); return err; } +#ifdef CONFIG_TCP_STEALTH + case TCP_STEALTH: { + u8 secret[MD5_MESSAGE_BYTES] = { 0 }; + + val = copy_from_user(secret, optval, + min_t(unsigned int, optlen, + MD5_MESSAGE_BYTES)); + if (val != 0) + return -EFAULT; + + lock_sock(sk); + memcpy(tp->stealth.secret, secret, MD5_MESSAGE_BYTES); + tp->stealth.mode = TCP_STEALTH_MODE_AUTH; + tp->stealth.mstamp.v64 = 0; + tp->stealth.saw_tsval = false; + release_sock(sk); + return err; + } + case TCP_STEALTH_INTEGRITY: { + u8 *payload; + + lock_sock(sk); + + if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) { + err = -EOPNOTSUPP; + goto stealth_integrity_out_1; + } + + if (optlen < 1 || optlen > USHRT_MAX) { + err = -EINVAL; + goto stealth_integrity_out_1; + } + + payload = vmalloc(optlen); + if (!payload) { + err = -ENOMEM; + goto stealth_integrity_out_1; + } + + val = copy_from_user(payload, optval, optlen); + if (val != 0) { + err = -EFAULT; + goto stealth_integrity_out_2; + } + + err = tcp_stealth_integrity(&tp->stealth.integrity_hash, + tp->stealth.secret, payload, + optlen); + if (err) + goto stealth_integrity_out_2; + + tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY; + +stealth_integrity_out_2: + vfree(payload); +stealth_integrity_out_1: + release_sock(sk); + return err; + } +#endif default: /* fallthru */ break; @@ -2594,6 +2692,18 @@ tp->notsent_lowat = val; sk->sk_write_space(sk); break; +#ifdef CONFIG_TCP_STEALTH + case TCP_STEALTH_INTEGRITY_LEN: + if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) { + err = -EOPNOTSUPP; + } else if (val < 1 || val > USHRT_MAX) { + err = -EINVAL; + } else { + tp->stealth.integrity_len = val; + tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY_LEN; + } + break; +#endif default: err = -ENOPROTOOPT; break; diff -Nur linux-4.2/net/ipv4/tcp_input.c linux-4.2-pck/net/ipv4/tcp_input.c --- linux-4.2/net/ipv4/tcp_input.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ipv4/tcp_input.c 2015-09-08 00:48:31.504590523 -0300 @@ -77,6 +77,9 @@ #include int sysctl_tcp_timestamps __read_mostly = 1; +#ifdef CONFIG_TCP_STEALTH +EXPORT_SYMBOL(sysctl_tcp_timestamps); +#endif int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; @@ -3796,6 +3799,47 @@ return true; } +#ifdef CONFIG_TCP_STEALTH +/* Parse only the TSVal field of the TCP Timestamp option header. 
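+ *
+ * Unlike the full tcp_parse_options(), it walks the option list only far
+ * enough to extract TSVal, which TCP Stealth mixes into its ISN
+ * computation.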
+ */ +const bool tcp_parse_tsval_option(u32 *tsval, const struct tcphdr *th) +{ + int length = (th->doff << 2) - sizeof(*th); + const u8 *ptr = (const u8 *)(th + 1); + + /* If the TCP option is too short, we can short cut */ + if (length < TCPOLEN_TIMESTAMP) + return false; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return false; + case TCPOPT_NOP: + length--; + continue; + case TCPOPT_TIMESTAMP: + opsize = *ptr++; + if (opsize != TCPOLEN_TIMESTAMP || opsize > length) + return false; + *tsval = get_unaligned_be32(ptr); + return true; + default: + opsize = *ptr++; + if (opsize < 2 || opsize > length) + return false; + } + ptr += opsize - 2; + length -= opsize; + } + return false; +} +EXPORT_SYMBOL(tcp_parse_tsval_option); +#endif + #ifdef CONFIG_TCP_MD5SIG /* * Parse MD5 Signature option @@ -4465,6 +4509,31 @@ return -ENOMEM; } +#ifdef CONFIG_TCP_STEALTH +static int __tcp_stealth_integrity_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + u16 hash; + __be32 seq = cpu_to_be32(TCP_SKB_CB(skb)->seq - 1); + char *data = skb->data + th->doff * 4; + int len = skb->len - th->doff * 4; + + if (len < tp->stealth.integrity_len) + return 1; + + if (tcp_stealth_integrity(&hash, tp->stealth.secret, data, + tp->stealth.integrity_len)) + return 1; + + if (be32_isn_to_be16_ih(seq) != cpu_to_be16(hash)) + return 1; + + tp->stealth.mode &= ~TCP_STEALTH_MODE_INTEGRITY_LEN; + return 0; +} +#endif + static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4474,6 +4543,14 @@ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) goto drop; +#ifdef CONFIG_TCP_STEALTH + if (unlikely(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) && + __tcp_stealth_integrity_check(sk, skb)) { + tcp_reset(sk); + goto drop; + } +#endif + skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); @@ -5246,6 +5323,15 @@ int eaten = 0; bool fragstolen = false; +#ifdef CONFIG_TCP_STEALTH + if (unlikely(tp->stealth.mode & + TCP_STEALTH_MODE_INTEGRITY_LEN) && + __tcp_stealth_integrity_check(sk, skb)) { + tcp_reset(sk); + goto discard; + } +#endif + if (tp->ucopy.task == current && tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len && diff -Nur linux-4.2/net/ipv4/tcp_ipv4.c linux-4.2-pck/net/ipv4/tcp_ipv4.c --- linux-4.2/net/ipv4/tcp_ipv4.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ipv4/tcp_ipv4.c 2015-09-08 00:48:31.504590523 -0300 @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -235,6 +236,21 @@ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); +#ifdef CONFIG_TCP_STEALTH + /* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp as + * early as possible and thus move taking the snapshot of tcp_time_stamp + * here. 
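+	 * The same snapshot is reused for tp->retrans_stamp in tcp_connect(),
+	 * keeping the stealth ISN and the advertised timestamp consistent.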
+ */ + skb_mstamp_get(&tp->stealth.mstamp); + + if (!tp->write_seq && likely(!tp->repair) && + unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) + tp->write_seq = tcp_stealth_sequence_number(sk, + &inet->inet_daddr, + sizeof(inet->inet_daddr), + usin->sin_port); +#endif + if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, inet->inet_daddr, @@ -1382,6 +1398,8 @@ */ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + struct tcphdr *th = tcp_hdr(skb); struct sock *rsk; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ @@ -1403,6 +1421,15 @@ if (tcp_checksum_complete(skb)) goto csum_err; +#ifdef CONFIG_TCP_STEALTH + if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin && + unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH) && + tcp_stealth_do_auth(sk, skb)) { + rsk = sk; + goto reset; + } +#endif + if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v4_hnd_req(sk, skb); if (!nsk) diff -Nur linux-4.2/net/ipv4/tcp_output.c linux-4.2-pck/net/ipv4/tcp_output.c --- linux-4.2/net/ipv4/tcp_output.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ipv4/tcp_output.c 2015-09-08 00:48:31.504590523 -0300 @@ -939,6 +939,13 @@ tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); +#ifdef TCP_STEALTH + if (unlikely(tcb->tcp_flags & TCPHDR_SYN && + tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) { + skb->skb_mstamp = tp->stealth.mstamp; + } +#endif + if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); else @@ -3251,7 +3258,15 @@ return -ENOBUFS; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); +#ifdef CONFIG_TCP_STEALTH + /* The timetamp was already made at the time the ISN was generated + * as we need to know its value in the stealth_tcp_sequence_number() + * function. + */ + tp->retrans_stamp = tp->stealth.mstamp.stamp_jiffies; +#else tp->retrans_stamp = tcp_time_stamp; +#endif tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); diff -Nur linux-4.2/net/ipv6/tcp_ipv6.c linux-4.2-pck/net/ipv6/tcp_ipv6.c --- linux-4.2/net/ipv6/tcp_ipv6.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/ipv6/tcp_ipv6.c 2015-09-08 00:48:31.504590523 -0300 @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -278,6 +279,21 @@ ip6_set_txhash(sk); +#ifdef CONFIG_TCP_STEALTH + /* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp as + * early as possible and thus move taking the snapshot of tcp_time_stamp + * here. + */ + skb_mstamp_get(&tp->stealth.mstamp); + + if (!tp->write_seq && likely(!tp->repair) && + unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) + tp->write_seq = tcp_stealth_sequence_number(sk, + sk->sk_v6_daddr.s6_addr32, + sizeof(sk->sk_v6_daddr), + inet->inet_dport); +#endif + if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, @@ -1191,7 +1207,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp_sock *tp; + struct tcp_sock *tp = tcp_sk(sk); + struct tcphdr *th = tcp_hdr(skb); struct sk_buff *opt_skb = NULL; /* Imagine: socket is IPv6. 
IPv4 packet arrives, @@ -1251,6 +1268,13 @@ if (tcp_checksum_complete(skb)) goto csum_err; +#ifdef CONFIG_TCP_STEALTH + if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin && + tp->stealth.mode & TCP_STEALTH_MODE_AUTH && + tcp_stealth_do_auth(sk, skb)) + goto reset; +#endif + if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_hnd_req(sk, skb); if (!nsk) diff -Nur linux-4.2/net/netfilter/nf_conntrack_core.c linux-4.2-pck/net/netfilter/nf_conntrack_core.c --- linux-4.2/net/netfilter/nf_conntrack_core.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/netfilter/nf_conntrack_core.c 2015-09-08 00:48:22.241696581 -0300 @@ -320,12 +320,13 @@ } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); -static void nf_ct_tmpl_free(struct nf_conn *tmpl) +void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); nf_ct_ext_free(tmpl); kfree(tmpl); } +EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); static void destroy_conntrack(struct nf_conntrack *nfct) diff -Nur linux-4.2/net/netfilter/nf_synproxy_core.c linux-4.2-pck/net/netfilter/nf_synproxy_core.c --- linux-4.2/net/netfilter/nf_synproxy_core.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/netfilter/nf_synproxy_core.c 2015-09-08 00:48:22.241696581 -0300 @@ -378,7 +378,7 @@ err3: free_percpu(snet->stats); err2: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err1: return err; } diff -Nur linux-4.2/net/netfilter/xt_CT.c linux-4.2-pck/net/netfilter/xt_CT.c --- linux-4.2/net/netfilter/xt_CT.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/net/netfilter/xt_CT.c 2015-09-08 00:48:22.241696581 -0300 @@ -233,7 +233,7 @@ return 0; err3: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err2: nf_ct_l3proto_module_put(par->family); err1: diff -Nur linux-4.2/samples/Kconfig linux-4.2-pck/samples/Kconfig --- linux-4.2/samples/Kconfig 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/samples/Kconfig 2015-09-08 00:48:22.241696581 -0300 @@ -55,6 +55,13 @@ Build an example of how to dynamically add the hello command to the kdb shell. +config SAMPLE_KDBUS + bool "Build kdbus API example" + depends on KDBUS + help + Build an example of how the kdbus API can be used from + userspace. + config SAMPLE_RPMSG_CLIENT tristate "Build rpmsg client sample -- loadable modules only" depends on RPMSG && m diff -Nur linux-4.2/samples/Makefile linux-4.2-pck/samples/Makefile --- linux-4.2/samples/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/samples/Makefile 2015-09-08 00:48:22.241696581 -0300 @@ -1,4 +1,5 @@ # Makefile for Linux samples code obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ livepatch/ \ - hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ + hw_breakpoint/ kfifo/ kdb/ kdbus/ hidraw/ rpmsg/ \ + seccomp/ diff -Nur linux-4.2/samples/kdbus/.gitignore linux-4.2-pck/samples/kdbus/.gitignore --- linux-4.2/samples/kdbus/.gitignore 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/samples/kdbus/.gitignore 2015-09-08 00:48:22.241696581 -0300 @@ -0,0 +1 @@ +kdbus-workers diff -Nur linux-4.2/samples/kdbus/Makefile linux-4.2-pck/samples/kdbus/Makefile --- linux-4.2/samples/kdbus/Makefile 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/samples/kdbus/Makefile 2015-09-08 00:48:22.241696581 -0300 @@ -0,0 +1,9 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. 
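+# Nothing here is built into the kernel itself; only the userspace
+# kdbus-workers sample below is compiled as a host program.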
+obj- := dummy.o + +hostprogs-$(CONFIG_SAMPLE_KDBUS) += kdbus-workers + +always := $(hostprogs-y) + +HOSTCFLAGS_kdbus-workers.o += -I$(objtree)/usr/include +HOSTLOADLIBES_kdbus-workers := -lrt diff -Nur linux-4.2/samples/kdbus/kdbus-api.h linux-4.2-pck/samples/kdbus/kdbus-api.h --- linux-4.2/samples/kdbus/kdbus-api.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/samples/kdbus/kdbus-api.h 2015-09-08 00:48:22.241696581 -0300 @@ -0,0 +1,114 @@ +#ifndef KDBUS_API_H +#define KDBUS_API_H + +#include +#include + +#define KDBUS_ALIGN8(l) (((l) + 7) & ~7) +#define KDBUS_ITEM_HEADER_SIZE offsetof(struct kdbus_item, data) +#define KDBUS_ITEM_SIZE(s) KDBUS_ALIGN8((s) + KDBUS_ITEM_HEADER_SIZE) +#define KDBUS_ITEM_NEXT(item) \ + (typeof(item))((uint8_t *)(item) + KDBUS_ALIGN8((item)->size)) +#define KDBUS_FOREACH(iter, first, _size) \ + for ((iter) = (first); \ + ((uint8_t *)(iter) < (uint8_t *)(first) + (_size)) && \ + ((uint8_t *)(iter) >= (uint8_t *)(first)); \ + (iter) = (void *)((uint8_t *)(iter) + KDBUS_ALIGN8((iter)->size))) + +static inline int kdbus_cmd_bus_make(int control_fd, struct kdbus_cmd *cmd) +{ + int ret = ioctl(control_fd, KDBUS_CMD_BUS_MAKE, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_endpoint_make(int bus_fd, struct kdbus_cmd *cmd) +{ + int ret = ioctl(bus_fd, KDBUS_CMD_ENDPOINT_MAKE, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_endpoint_update(int ep_fd, struct kdbus_cmd *cmd) +{ + int ret = ioctl(ep_fd, KDBUS_CMD_ENDPOINT_UPDATE, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_hello(int bus_fd, struct kdbus_cmd_hello *cmd) +{ + int ret = ioctl(bus_fd, KDBUS_CMD_HELLO, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_update(int fd, struct kdbus_cmd *cmd) +{ + int ret = ioctl(fd, KDBUS_CMD_UPDATE, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_byebye(int conn_fd, struct kdbus_cmd *cmd) +{ + int ret = ioctl(conn_fd, KDBUS_CMD_BYEBYE, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_free(int conn_fd, struct kdbus_cmd_free *cmd) +{ + int ret = ioctl(conn_fd, KDBUS_CMD_FREE, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_conn_info(int conn_fd, struct kdbus_cmd_info *cmd) +{ + int ret = ioctl(conn_fd, KDBUS_CMD_CONN_INFO, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_bus_creator_info(int conn_fd, struct kdbus_cmd_info *cmd) +{ + int ret = ioctl(conn_fd, KDBUS_CMD_BUS_CREATOR_INFO, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_list(int fd, struct kdbus_cmd_list *cmd) +{ + int ret = ioctl(fd, KDBUS_CMD_LIST, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_send(int conn_fd, struct kdbus_cmd_send *cmd) +{ + int ret = ioctl(conn_fd, KDBUS_CMD_SEND, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_recv(int conn_fd, struct kdbus_cmd_recv *cmd) +{ + int ret = ioctl(conn_fd, KDBUS_CMD_RECV, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_name_acquire(int conn_fd, struct kdbus_cmd *cmd) +{ + int ret = ioctl(conn_fd, KDBUS_CMD_NAME_ACQUIRE, cmd); + return (ret < 0) ? (errno > 0 ? 
-errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_name_release(int conn_fd, struct kdbus_cmd *cmd) +{ + int ret = ioctl(conn_fd, KDBUS_CMD_NAME_RELEASE, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_match_add(int conn_fd, struct kdbus_cmd_match *cmd) +{ + int ret = ioctl(conn_fd, KDBUS_CMD_MATCH_ADD, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +static inline int kdbus_cmd_match_remove(int conn_fd, struct kdbus_cmd_match *cmd) +{ + int ret = ioctl(conn_fd, KDBUS_CMD_MATCH_REMOVE, cmd); + return (ret < 0) ? (errno > 0 ? -errno : -EINVAL) : 0; +} + +#endif /* KDBUS_API_H */ diff -Nur linux-4.2/samples/kdbus/kdbus-workers.c linux-4.2-pck/samples/kdbus/kdbus-workers.c --- linux-4.2/samples/kdbus/kdbus-workers.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/samples/kdbus/kdbus-workers.c 2015-09-08 00:48:22.241696581 -0300 @@ -0,0 +1,1346 @@ +/* + * Copyright (C) 2013-2015 David Herrmann + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +/* + * Example: Workers + * This program computes prime-numbers based on the sieve of Eratosthenes. The + * master sets up a shared memory region and spawns workers which clear out the + * non-primes. The master reacts to keyboard input and to client-requests to + * control what each worker does. Note that this is in no way meant as an + * efficient way to compute primes. It should only serve as an example of how a + * master/worker concept can be implemented with kdbus used for the control + * messages. + * + * The main process is called the 'master'. It creates a new, private bus which + * will be used between the master and its workers to communicate. The master + * then spawns a fixed number of workers. Whenever a worker dies (detected via + * SIGCHLD), the master spawns a new worker. When done, the master waits for all + * workers to exit, prints a status report and exits itself. + * + * The master process does *not* keep track of its workers. Instead, this + * example implements a PULL model. That is, the master acquires a well-known + * name on the bus which each worker uses to request tasks from the master. If + * there are no more tasks, the master will return an empty task-list, which + * causes a worker to exit immediately. + * + * As tasks can be computationally expensive, we support cancellation. Whenever + * the master process is interrupted, it will drop its well-known name on the + * bus. This causes kdbus to broadcast a name-change notification. The workers + * check for broadcast messages regularly and will exit if they receive one. + * + * This example consists of 4 objects: + * * master: The master object contains the context of the master process. This + * process manages the prime-context, spawns workers and assigns + * prime-ranges to each worker to compute. + * The master does not do any prime-computations itself. + * * child: The child object contains the context of a worker. It inherits the + * prime context from its parent (the master) and then creates a new + * bus context to request prime-ranges to compute. + * * prime: The "prime" object is used to abstract how we compute primes. When + * allocated, it prepares a memory region to hold 1 bit for each + * natural number up to a fixed maximum ('MAX_PRIMES').
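+ * (For illustration, using the constants defined further down: with + * MAX_PRIMES = 2UL << 24 = 33554432, the bitmap takes MAX_PRIMES / 8 + 1 = + * 4194305 bytes, roughly 4 MiB, which is exactly how prime_new() below + * sizes the shared memfd.)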
+ * The memory region is backed by a memfd which we share between + * processes. Each worker now gets assigned a range of natural + * numbers which it clears multiples of off the memory region. The + * master process is responsible of distributing all natural numbers + * up to the fixed maximum to its workers. + * * bus: The bus object is an abstraction of the kdbus API. It is pretty + * straightfoward and only manages the connection-fd plus the + * memory-mapped pool in a single object. + * + * This example is in reversed order, which should make it easier to read + * top-down, but requires some forward-declarations. Just ignore those. + */ + +#include +#include +#include + +/* glibc < 2.7 does not ship sys/signalfd.h */ +/* we require kernels with __NR_memfd_create */ +#if __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 7 && defined(__NR_memfd_create) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kdbus-api.h" + +/* FORWARD DECLARATIONS */ + +#define POOL_SIZE (16 * 1024 * 1024) +#define MAX_PRIMES (2UL << 24) +#define WORKER_COUNT (16) +#define PRIME_STEPS (65536 * 4) + +static const char *arg_busname = "example-workers"; +static const char *arg_modname = "kdbus"; +static const char *arg_master = "org.freedesktop.master"; + +static int err_assert(int r_errno, const char *msg, const char *func, int line, + const char *file) +{ + r_errno = (r_errno != 0) ? -abs(r_errno) : -EFAULT; + if (r_errno < 0) { + errno = -r_errno; + fprintf(stderr, "ERR: %s: %m (%s:%d in %s)\n", + msg, func, line, file); + } + return r_errno; +} + +#define err_r(_r, _msg) err_assert((_r), (_msg), __func__, __LINE__, __FILE__) +#define err(_msg) err_r(errno, (_msg)) + +struct prime; +struct bus; +struct master; +struct child; + +struct prime { + int fd; + uint8_t *area; + size_t max; + size_t done; + size_t status; +}; + +static int prime_new(struct prime **out); +static void prime_free(struct prime *p); +static bool prime_done(struct prime *p); +static void prime_consume(struct prime *p, size_t amount); +static int prime_run(struct prime *p, struct bus *cancel, size_t number); +static void prime_print(struct prime *p); + +struct bus { + int fd; + uint8_t *pool; +}; + +static int bus_open_connection(struct bus **out, uid_t uid, const char *name, + uint64_t recv_flags); +static void bus_close_connection(struct bus *b); +static void bus_poool_free_slice(struct bus *b, uint64_t offset); +static int bus_acquire_name(struct bus *b, const char *name); +static int bus_install_name_loss_match(struct bus *b, const char *name); +static int bus_poll(struct bus *b); +static int bus_make(uid_t uid, const char *name); + +struct master { + size_t n_workers; + size_t max_workers; + + int signal_fd; + int control_fd; + + struct prime *prime; + struct bus *bus; +}; + +static int master_new(struct master **out); +static void master_free(struct master *m); +static int master_run(struct master *m); +static int master_poll(struct master *m); +static int master_handle_stdin(struct master *m); +static int master_handle_signal(struct master *m); +static int master_handle_bus(struct master *m); +static int master_reply(struct master *m, const struct kdbus_msg *msg); +static int master_waitpid(struct master *m); +static int master_spawn(struct master *m); + +struct child { + struct bus *bus; + struct prime *prime; +}; + +static int child_new(struct child **out, struct prime *p); +static void child_free(struct child *c); +static 
int child_run(struct child *c); + +/* END OF FORWARD DECLARATIONS */ + +/* + * This is the main entrypoint of this example. It is pretty straightforward. We + * create a master object, run the computation, print a status report and then + * exit. Nothing particularly interesting here, so let's look into the master + * object... + */ +int main(int argc, char **argv) +{ + struct master *m = NULL; + int r; + + r = master_new(&m); + if (r < 0) + goto out; + + r = master_run(m); + if (r < 0) + goto out; + + if (0) + prime_print(m->prime); + +out: + master_free(m); + if (r < 0 && r != -EINTR) + fprintf(stderr, "failed\n"); + else + fprintf(stderr, "done\n"); + return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS; +} + +/* + * ...this will allocate a new master context. It keeps track of the current + * number of children/workers that are running, manages a signalfd to track + * SIGCHLD, and creates a private kdbus bus. Afterwards, it opens its connection + * to the bus and acquires a well-known name (arg_master). + */ +static int master_new(struct master **out) +{ + struct master *m; + sigset_t smask; + int r; + + m = calloc(1, sizeof(*m)); + if (!m) + return err("cannot allocate master"); + + m->max_workers = WORKER_COUNT; + m->signal_fd = -1; + m->control_fd = -1; + + /* Block SIGINT and SIGCHLD signals */ + sigemptyset(&smask); + sigaddset(&smask, SIGINT); + sigaddset(&smask, SIGCHLD); + sigprocmask(SIG_BLOCK, &smask, NULL); + + m->signal_fd = signalfd(-1, &smask, SFD_CLOEXEC); + if (m->signal_fd < 0) { + r = err("cannot create signalfd"); + goto error; + } + + r = prime_new(&m->prime); + if (r < 0) + goto error; + + m->control_fd = bus_make(getuid(), arg_busname); + if (m->control_fd < 0) { + r = m->control_fd; + goto error; + } + + /* + * Open a bus connection for the master, and require each received + * message to have a metadata item of type KDBUS_ITEM_PIDS attached. + * The current UID is needed to compute the name of the bus node to + * connect to. + */ + r = bus_open_connection(&m->bus, getuid(), + arg_busname, KDBUS_ATTACH_PIDS); + if (r < 0) + goto error; + + /* + * Acquire a well-known name on the bus, so children can address + * messages to the master using KDBUS_DST_ID_NAME as destination-ID + * of messages. + */ + r = bus_acquire_name(m->bus, arg_master); + if (r < 0) + goto error; + + *out = m; + return 0; + +error: + master_free(m); + return r; +} + +/* pretty straightforward destructor of a master object */ +static void master_free(struct master *m) +{ + if (!m) + return; + + bus_close_connection(m->bus); + if (m->control_fd >= 0) + close(m->control_fd); + prime_free(m->prime); + if (m->signal_fd >= 0) + close(m->signal_fd); + free(m); +} + +static int master_run(struct master *m) +{ + int res, r = 0; + + while (!prime_done(m->prime)) { + while (m->n_workers < m->max_workers) { + r = master_spawn(m); + if (r < 0) + break; + } + + r = master_poll(m); + if (r < 0) + break; + } + + if (r < 0) { + bus_close_connection(m->bus); + m->bus = NULL; + } + + while (m->n_workers > 0) { + res = master_poll(m); + if (res < 0) { + if (m->bus) { + bus_close_connection(m->bus); + m->bus = NULL; + } + r = res; + } + } + + return r == -EINTR ? 0 : r; +} + +static int master_poll(struct master *m) +{ + struct pollfd fds[3] = {}; + int r = 0, n = 0; + + /* + * Add stdin, the signalfd and the connection owner file descriptor to + * the pollfd table, and handle incoming traffic on the latter in + * master_handle_bus().
+ */ + fds[n].fd = STDIN_FILENO; + fds[n++].events = POLLIN; + fds[n].fd = m->signal_fd; + fds[n++].events = POLLIN; + if (m->bus) { + fds[n].fd = m->bus->fd; + fds[n++].events = POLLIN; + } + + r = poll(fds, n, -1); + if (r < 0) + return err("poll() failed"); + + if (fds[0].revents & POLLIN) + r = master_handle_stdin(m); + else if (fds[0].revents) + r = err("ERR/HUP on stdin"); + if (r < 0) + return r; + + if (fds[1].revents & POLLIN) + r = master_handle_signal(m); + else if (fds[1].revents) + r = err("ERR/HUP on signalfd"); + if (r < 0) + return r; + + if (fds[2].revents & POLLIN) + r = master_handle_bus(m); + else if (fds[2].revents) + r = err("ERR/HUP on bus"); + + return r; +} + +static int master_handle_stdin(struct master *m) +{ + char buf[128]; + ssize_t l; + int r = 0; + + l = read(STDIN_FILENO, buf, sizeof(buf)); + if (l < 0) + return err("cannot read stdin"); + if (l == 0) + return err_r(-EINVAL, "EOF on stdin"); + + while (l-- > 0) { + switch (buf[l]) { + case 'q': + /* quit */ + r = -EINTR; + break; + case '\n': + case ' ': + /* ignore */ + break; + default: + if (isgraph(buf[l])) + fprintf(stderr, "invalid input '%c'\n", buf[l]); + else + fprintf(stderr, "invalid input 0x%x\n", buf[l]); + break; + } + } + + return r; +} + +static int master_handle_signal(struct master *m) +{ + struct signalfd_siginfo val; + ssize_t l; + + l = read(m->signal_fd, &val, sizeof(val)); + if (l < 0) + return err("cannot read signalfd"); + if (l != sizeof(val)) + return err_r(-EINVAL, "invalid data from signalfd"); + + switch (val.ssi_signo) { + case SIGCHLD: + return master_waitpid(m); + case SIGINT: + return err_r(-EINTR, "interrupted"); + default: + return err_r(-EINVAL, "caught invalid signal"); + } +} + +static int master_handle_bus(struct master *m) +{ + struct kdbus_cmd_recv recv = { .size = sizeof(recv) }; + const struct kdbus_msg *msg = NULL; + const struct kdbus_item *item; + const struct kdbus_vec *vec = NULL; + int r = 0; + + /* + * To receive a message, the KDBUS_CMD_RECV ioctl is used. + * It takes an argument of type 'struct kdbus_cmd_recv', which + * will contain information on the received message when the call + * returns. See kdbus.message(7). + */ + r = kdbus_cmd_recv(m->bus->fd, &recv); + /* + * EAGAIN is returned when there is no message waiting on this + * connection. This is not an error - simply bail out. + */ + if (r == -EAGAIN) + return 0; + if (r < 0) + return err_r(r, "cannot receive message"); + + /* + * Messages received by a connection are stored inside the connection's + * pool, at an offset that has been returned in the 'recv' command + * struct above. The value describes the relative offset from the + * start address of the pool. A message is described with + * 'struct kdbus_msg'. See kdbus.message(7). + */ + msg = (void *)(m->bus->pool + recv.msg.offset); + + /* + * A messages describes its actual payload in an array of items. + * KDBUS_FOREACH() is a simple iterator that walks such an array. + * struct kdbus_msg has a field to denote its total size, which is + * needed to determine the number of items in the array. + */ + KDBUS_FOREACH(item, msg->items, + msg->size - offsetof(struct kdbus_msg, items)) { + /* + * An item of type PAYLOAD_OFF describes in-line memory + * stored in the pool at a described offset. That offset is + * relative to the start address of the message header. 
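+ * (Concretely, in this function: the message header was found at + * m->bus->pool + recv.msg.offset, and the payload bytes of a PAYLOAD_OFF + * item are read below at (const uint8_t *)msg + vec->offset.)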
+ * This example program only expects one single item of that + * type, remembers the struct kdbus_vec member of the item + * when it sees it, and bails out if there is more than one + * of them. + */ + if (item->type == KDBUS_ITEM_PAYLOAD_OFF) { + if (vec) { + r = err_r(-EEXIST, + "message with multiple vecs"); + break; + } + vec = &item->vec; + if (vec->size != 1) { + r = err_r(-EINVAL, "invalid message size"); + break; + } + + /* + * MEMFDs are transported as items of type PAYLOAD_MEMFD. + * If such an item is attached, a new file descriptor was + * installed into the task when KDBUS_CMD_RECV was called, and + * its number is stored in item->memfd.fd. + * Implementers *must* handle this item type and close the + * file descriptor when no longer needed in order to prevent + * file descriptor exhaustion. This example program just bails + * out with an error in this case, as memfds are not expected + * in this context. + */ + } else if (item->type == KDBUS_ITEM_PAYLOAD_MEMFD) { + r = err_r(-EINVAL, "message with memfd"); + break; + } + } + if (r < 0) + goto exit; + if (!vec) { + r = err_r(-EINVAL, "empty message"); + goto exit; + } + + switch (*((const uint8_t *)msg + vec->offset)) { + case 'r': { + r = master_reply(m, msg); + break; + } + default: + r = err_r(-EINVAL, "invalid message type"); + break; + } + +exit: + /* + * We are done with the memory slice that was given to us through + * recv.msg.offset. Tell the kernel it can use it for other content + * in the future. See kdbus.pool(7). + */ + bus_poool_free_slice(m->bus, recv.msg.offset); + return r; +} + +static int master_reply(struct master *m, const struct kdbus_msg *msg) +{ + struct kdbus_cmd_send cmd; + struct kdbus_item *item; + struct kdbus_msg *reply; + size_t size, status, p[2]; + int r; + + /* + * This functions sends a message over kdbus. To do this, it uses the + * KDBUS_CMD_SEND ioctl, which takes a command struct argument of type + * 'struct kdbus_cmd_send'. This struct stores a pointer to the actual + * message to send. See kdbus.message(7). + */ + p[0] = m->prime->done; + p[1] = prime_done(m->prime) ? 0 : PRIME_STEPS; + + size = sizeof(*reply); + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_vec)); + + /* Prepare the message to send */ + reply = alloca(size); + memset(reply, 0, size); + reply->size = size; + + /* Each message has a cookie that can be used to send replies */ + reply->cookie = 1; + + /* The payload_type is arbitrary, but it must be non-zero */ + reply->payload_type = 0xdeadbeef; + + /* + * We are sending a reply. Let the kernel know the cookie of the + * message we are replying to. + */ + reply->cookie_reply = msg->cookie; + + /* + * Messages can either be directed to a well-known name (stored as + * string) or to a unique name (stored as number). This example does + * the latter. If the message would be directed to a well-known name + * instead, the message's dst_id field would be set to + * KDBUS_DST_ID_NAME, and the name would be attaches in an item of type + * KDBUS_ITEM_DST_NAME. See below for an example, and also refer to + * kdbus.message(7). + */ + reply->dst_id = msg->src_id; + + /* Our message has exactly one item to store its payload */ + item = reply->items; + item->type = KDBUS_ITEM_PAYLOAD_VEC; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec); + item->vec.address = (uintptr_t)p; + item->vec.size = sizeof(p); + + /* + * Now prepare the command struct, and reference the message we want + * to send. 
+ */ + memset(&cmd, 0, sizeof(cmd)); + cmd.size = sizeof(cmd); + cmd.msg_address = (uintptr_t)reply; + + /* + * Finally, employ the command on the connection owner + * file descriptor. + */ + r = kdbus_cmd_send(m->bus->fd, &cmd); + if (r < 0) + return err_r(r, "cannot send reply"); + + if (p[1]) { + prime_consume(m->prime, p[1]); + status = m->prime->done * 10000 / m->prime->max; + if (status != m->prime->status) { + m->prime->status = status; + fprintf(stderr, "status: %7.3lf%%\n", + (double)status / 100); + } + } + + return 0; +} + +static int master_waitpid(struct master *m) +{ + pid_t pid; + int r; + + while ((pid = waitpid(-1, &r, WNOHANG)) > 0) { + if (m->n_workers > 0) + --m->n_workers; + if (!WIFEXITED(r)) + r = err_r(-EINVAL, "child died unexpectedly"); + else if (WEXITSTATUS(r) != 0) + r = err_r(-WEXITSTATUS(r), "child failed"); + } + + return r; +} + +static int master_spawn(struct master *m) +{ + struct child *c = NULL; + struct prime *p = NULL; + pid_t pid; + int r; + + /* Spawn off one child and call child_run() inside it */ + + pid = fork(); + if (pid < 0) + return err("cannot fork"); + if (pid > 0) { + /* parent */ + ++m->n_workers; + return 0; + } + + /* child */ + + p = m->prime; + m->prime = NULL; + master_free(m); + + r = child_new(&c, p); + if (r < 0) + goto exit; + + r = child_run(c); + +exit: + child_free(c); + exit(abs(r)); +} + +static int child_new(struct child **out, struct prime *p) +{ + struct child *c; + int r; + + c = calloc(1, sizeof(*c)); + if (!c) + return err("cannot allocate child"); + + c->prime = p; + + /* + * Open a connection to the bus and require each received message to + * carry a list of the well-known names the sendind connection currently + * owns. The current UID is needed in order to determine the name of the + * bus node to connect to. + */ + r = bus_open_connection(&c->bus, getuid(), + arg_busname, KDBUS_ATTACH_NAMES); + if (r < 0) + goto error; + + /* + * Install a kdbus match so the child's connection gets notified when + * the master loses its well-known name. + */ + r = bus_install_name_loss_match(c->bus, arg_master); + if (r < 0) + goto error; + + *out = c; + return 0; + +error: + child_free(c); + return r; +} + +static void child_free(struct child *c) +{ + if (!c) + return; + + bus_close_connection(c->bus); + prime_free(c->prime); + free(c); +} + +static int child_run(struct child *c) +{ + struct kdbus_cmd_send cmd; + struct kdbus_item *item; + struct kdbus_vec *vec = NULL; + struct kdbus_msg *msg; + struct timespec spec; + size_t n, steps, size; + int r = 0; + + /* + * Let's send a message to the master and ask for work. To do this, + * we use the KDBUS_CMD_SEND ioctl, which takes an argument of type + * 'struct kdbus_cmd_send'. This struct stores a pointer to the actual + * message to send. See kdbus.message(7). + */ + size = sizeof(*msg); + size += KDBUS_ITEM_SIZE(strlen(arg_master) + 1); + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_vec)); + + msg = alloca(size); + memset(msg, 0, size); + msg->size = size; + + /* + * Tell the kernel that we expect a reply to this message. This means + * that + * + * a) The remote peer will gain temporary permission to talk to us + * even if it would not be allowed to normally. + * + * b) A timeout value is required. + * + * For asynchronous send commands, if no reply is received, we will + * get a kernel notification with an item of type + * KDBUS_ITEM_REPLY_TIMEOUT attached. 
+ * + * For synchronous send commands (which this example does), the + * ioctl will block until a reply is received or the timeout is + * exceeded. + */ + msg->flags = KDBUS_MSG_EXPECT_REPLY; + + /* Set our cookie. Replies must use this cookie to send their reply. */ + msg->cookie = 1; + + /* The payload_type is arbitrary, but it must be non-zero */ + msg->payload_type = 0xdeadbeef; + + /* + * We are sending our message to the current owner of a well-known + * name. This makes an item of type KDBUS_ITEM_DST_NAME mandatory. + */ + msg->dst_id = KDBUS_DST_ID_NAME; + + /* + * Set the reply timeout to 5 seconds. Timeouts are always set in + * absolute timestamps, based con CLOCK_MONOTONIC. See kdbus.message(7). + */ + clock_gettime(CLOCK_MONOTONIC_COARSE, &spec); + msg->timeout_ns += (5 + spec.tv_sec) * 1000ULL * 1000ULL * 1000ULL; + msg->timeout_ns += spec.tv_nsec; + + /* + * Fill the appended items. First, set the well-known name of the + * destination we want to talk to. + */ + item = msg->items; + item->type = KDBUS_ITEM_DST_NAME; + item->size = KDBUS_ITEM_HEADER_SIZE + strlen(arg_master) + 1; + strcpy(item->str, arg_master); + + /* + * The 2nd item contains a vector to memory we want to send. It + * can be content of any type. In our case, we're sending a one-byte + * string only. The memory referenced by this item will be copied into + * the pool of the receiver connection, and does not need to be valid + * after the command is employed. + */ + item = KDBUS_ITEM_NEXT(item); + item->type = KDBUS_ITEM_PAYLOAD_VEC; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec); + item->vec.address = (uintptr_t)"r"; + item->vec.size = 1; + + /* Set up the command struct and reference the message we prepared */ + memset(&cmd, 0, sizeof(cmd)); + cmd.size = sizeof(cmd); + cmd.msg_address = (uintptr_t)msg; + + /* + * The send commands knows a mode in which it will block until a + * reply to a message is received. This example uses that mode. + * The pool offset to the received reply will be stored in the command + * struct after the send command returned. See below. + */ + cmd.flags = KDBUS_SEND_SYNC_REPLY; + + /* + * Finally, employ the command on the connection owner + * file descriptor. + */ + r = kdbus_cmd_send(c->bus->fd, &cmd); + if (r == -ESRCH || r == -EPIPE || r == -ECONNRESET) + return 0; + if (r < 0) + return err_r(r, "cannot send request to master"); + + /* + * The command was sent with the KDBUS_SEND_SYNC_REPLY flag set, + * and returned successfully, which means that cmd.reply.offset now + * points to a message inside our connection's pool where the reply + * is found. This is equivalent to receiving the reply with + * KDBUS_CMD_RECV, but it doesn't require waiting for the reply with + * poll() and also saves the ioctl to receive the message. + */ + msg = (void *)(c->bus->pool + cmd.reply.offset); + + /* + * A messages describes its actual payload in an array of items. + * KDBUS_FOREACH() is a simple iterator that walks such an array. + * struct kdbus_msg has a field to denote its total size, which is + * needed to determine the number of items in the array. + */ + KDBUS_FOREACH(item, msg->items, + msg->size - offsetof(struct kdbus_msg, items)) { + /* + * An item of type PAYLOAD_OFF describes in-line memory + * stored in the pool at a described offset. That offset is + * relative to the start address of the message header. 
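+ * (Here, because the reply is received synchronously, the message is + * found at c->bus->pool + cmd.reply.offset rather than at an offset + * returned by KDBUS_CMD_RECV; the payload of the PAYLOAD_OFF item is + * again read below at (const uint8_t *)msg + vec->offset.)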
+ * This example program only expects one single item of that + * type, remembers the struct kdbus_vec member of the item + * when it sees it, and bails out if there is more than one + * of them. + */ + if (item->type == KDBUS_ITEM_PAYLOAD_OFF) { + if (vec) { + r = err_r(-EEXIST, + "message with multiple vecs"); + break; + } + vec = &item->vec; + if (vec->size != 2 * sizeof(size_t)) { + r = err_r(-EINVAL, "invalid message size"); + break; + } + /* + * MEMFDs are transported as items of type PAYLOAD_MEMFD. + * If such an item is attached, a new file descriptor was + * installed into the task when KDBUS_CMD_RECV was called, and + * its number is stored in item->memfd.fd. + * Implementers *must* handle this item type close the + * file descriptor when no longer needed in order to prevent + * file descriptor exhaustion. This example program just bails + * out with an error in this case, as memfds are not expected + * in this context. + */ + } else if (item->type == KDBUS_ITEM_PAYLOAD_MEMFD) { + r = err_r(-EINVAL, "message with memfd"); + break; + } + } + if (r < 0) + goto exit; + if (!vec) { + r = err_r(-EINVAL, "empty message"); + goto exit; + } + + n = ((size_t *)((const uint8_t *)msg + vec->offset))[0]; + steps = ((size_t *)((const uint8_t *)msg + vec->offset))[1]; + + while (steps-- > 0) { + ++n; + r = prime_run(c->prime, c->bus, n); + if (r < 0) + break; + r = bus_poll(c->bus); + if (r != 0) { + r = r < 0 ? r : -EINTR; + break; + } + } + +exit: + /* + * We are done with the memory slice that was given to us through + * cmd.reply.offset. Tell the kernel it can use it for other content + * in the future. See kdbus.pool(7). + */ + bus_poool_free_slice(c->bus, cmd.reply.offset); + return r; +} + +/* + * Prime Computation + * + */ + +static int prime_new(struct prime **out) +{ + struct prime *p; + int r; + + p = calloc(1, sizeof(*p)); + if (!p) + return err("cannot allocate prime memory"); + + p->fd = -1; + p->area = MAP_FAILED; + p->max = MAX_PRIMES; + + /* + * Prepare and map a memfd to store the bit-fields for the number + * ranges we want to perform the prime detection on. + */ + p->fd = syscall(__NR_memfd_create, "prime-area", MFD_CLOEXEC); + if (p->fd < 0) { + r = err("cannot create memfd"); + goto error; + } + + r = ftruncate(p->fd, p->max / 8 + 1); + if (r < 0) { + r = err("cannot ftruncate area"); + goto error; + } + + p->area = mmap(NULL, p->max / 8 + 1, PROT_READ | PROT_WRITE, + MAP_SHARED, p->fd, 0); + if (p->area == MAP_FAILED) { + r = err("cannot mmap memfd"); + goto error; + } + + *out = p; + return 0; + +error: + prime_free(p); + return r; +} + +static void prime_free(struct prime *p) +{ + if (!p) + return; + + if (p->area != MAP_FAILED) + munmap(p->area, p->max / 8 + 1); + if (p->fd >= 0) + close(p->fd); + free(p); +} + +static bool prime_done(struct prime *p) +{ + return p->done >= p->max; +} + +static void prime_consume(struct prime *p, size_t amount) +{ + p->done += amount; +} + +static int prime_run(struct prime *p, struct bus *cancel, size_t number) +{ + size_t i, n = 0; + int r; + + if (number < 2 || number > 65535) + return 0; + + for (i = number * number; + i < p->max && i > number; + i += number) { + p->area[i / 8] |= 1 << (i % 8); + + if (!(++n % (1 << 20))) { + r = bus_poll(cancel); + if (r != 0) + return r < 0 ? r : -EINTR; + } + } + + return 0; +} + +static void prime_print(struct prime *p) +{ + size_t i, l = 0; + + fprintf(stderr, "PRIMES:"); + for (i = 0; i < p->max; ++i) { + if (!(p->area[i / 8] & (1 << (i % 8)))) + fprintf(stderr, "%c%7zu", !(l++ % 16) ? 
'\n' : ' ', i); + } + fprintf(stderr, "\nEND\n"); +} + +static int bus_open_connection(struct bus **out, uid_t uid, const char *name, + uint64_t recv_flags) +{ + struct kdbus_cmd_hello hello; + char path[128]; + struct bus *b; + int r; + + /* + * The 'bus' object is our representation of a kdbus connection which + * stores two details: the connection owner file descriptor, and the + * mmap()ed memory of its associated pool. See kdbus.connection(7) and + * kdbus.pool(7). + */ + b = calloc(1, sizeof(*b)); + if (!b) + return err("cannot allocate bus memory"); + + b->fd = -1; + b->pool = MAP_FAILED; + + /* Compute the name of the bus node to connect to. */ + snprintf(path, sizeof(path), "/sys/fs/%s/%lu-%s/bus", + arg_modname, (unsigned long)uid, name); + b->fd = open(path, O_RDWR | O_CLOEXEC); + if (b->fd < 0) { + r = err("cannot open bus"); + goto error; + } + + /* + * To make a connection to the bus, the KDBUS_CMD_HELLO ioctl is used. + * It takes an argument of type 'struct kdbus_cmd_hello'. + */ + memset(&hello, 0, sizeof(hello)); + hello.size = sizeof(hello); + + /* + * Specify a mask of metadata attach flags, describing metadata items + * that this new connection allows to be sent. + */ + hello.attach_flags_send = _KDBUS_ATTACH_ALL; + + /* + * Specify a mask of metadata attach flags, describing metadata items + * that this new connection wants to receive along with each message. + */ + hello.attach_flags_recv = recv_flags; + + /* + * A connection may choose the size of its pool, but the number has to + * comply with two rules: a) it must be greater than 0, and b) it must + * be a multiple of PAGE_SIZE. See kdbus.pool(7). + */ + hello.pool_size = POOL_SIZE; + + /* + * Now employ the command on the file descriptor opened above. + * This command will turn the file descriptor into a connection-owner + * file descriptor that controls the life-time of the connection; once + * it's closed, the connection is shut down. + */ + r = kdbus_cmd_hello(b->fd, &hello); + if (r < 0) { + err_r(r, "HELLO failed"); + goto error; + } + + bus_poool_free_slice(b, hello.offset); + + /* + * Map the pool of the connection. Its size has been set in the + * command struct above. See kdbus.pool(7). + */ + b->pool = mmap(NULL, POOL_SIZE, PROT_READ, MAP_SHARED, b->fd, 0); + if (b->pool == MAP_FAILED) { + r = err("cannot mmap pool"); + goto error; + } + + *out = b; + return 0; + +error: + bus_close_connection(b); + return r; +} + +static void bus_close_connection(struct bus *b) +{ + if (!b) + return; + + /* + * A bus connection is closed by simply calling close() on the + * connection owner file descriptor. The unique name and all owned + * well-known names of the connection will disappear. + * See kdbus.connection(7). + */ + if (b->pool != MAP_FAILED) + munmap(b->pool, POOL_SIZE); + if (b->fd >= 0) + close(b->fd); + free(b); +} + +static void bus_poool_free_slice(struct bus *b, uint64_t offset) +{ + struct kdbus_cmd_free cmd = { + .size = sizeof(cmd), + .offset = offset, + }; + int r; + + /* + * Once we're done with a piece of pool memory that was returned + * by a command, we have to call the KDBUS_CMD_FREE ioctl on it so it + * can be reused. The command takes an argument of type + * 'struct kdbus_cmd_free', in which the pool offset of the slice to + * free is stored. The ioctl is employed on the connection owner + * file descriptor.
See kdbus.pool(7), + */ + r = kdbus_cmd_free(b->fd, &cmd); + if (r < 0) + err_r(r, "cannot free pool slice"); +} + +static int bus_acquire_name(struct bus *b, const char *name) +{ + struct kdbus_item *item; + struct kdbus_cmd *cmd; + size_t size; + int r; + + /* + * This function acquires a well-known name on the bus through the + * KDBUS_CMD_NAME_ACQUIRE ioctl. This ioctl takes an argument of type + * 'struct kdbus_cmd', which is assembled below. See kdbus.name(7). + */ + size = sizeof(*cmd); + size += KDBUS_ITEM_SIZE(strlen(name) + 1); + + cmd = alloca(size); + memset(cmd, 0, size); + cmd->size = size; + + /* + * The command requires an item of type KDBUS_ITEM_NAME, and its + * content must be a valid bus name. + */ + item = cmd->items; + item->type = KDBUS_ITEM_NAME; + item->size = KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + strcpy(item->str, name); + + /* + * Employ the command on the connection owner file descriptor. + */ + r = kdbus_cmd_name_acquire(b->fd, cmd); + if (r < 0) + return err_r(r, "cannot acquire name"); + + return 0; +} + +static int bus_install_name_loss_match(struct bus *b, const char *name) +{ + struct kdbus_cmd_match *match; + struct kdbus_item *item; + size_t size; + int r; + + /* + * In order to install a match for signal messages, we have to + * assemble a 'struct kdbus_cmd_match' and use it along with the + * KDBUS_CMD_MATCH_ADD ioctl. See kdbus.match(7). + */ + size = sizeof(*match); + size += KDBUS_ITEM_SIZE(sizeof(item->name_change) + strlen(name) + 1); + + match = alloca(size); + memset(match, 0, size); + match->size = size; + + /* + * A match is comprised of many 'rules', each of which describes a + * mandatory detail of the message. All rules of a match must be + * satified in order to make a message pass. + */ + item = match->items; + + /* + * In this case, we're interested in notifications that inform us + * about a well-known name being removed from the bus. + */ + item->type = KDBUS_ITEM_NAME_REMOVE; + item->size = KDBUS_ITEM_HEADER_SIZE + + sizeof(item->name_change) + strlen(name) + 1; + + /* + * We could limit the match further and require a specific unique-ID + * to be the new or the old owner of the name. In this case, however, + * we don't, and allow 'any' id. + */ + item->name_change.old_id.id = KDBUS_MATCH_ID_ANY; + item->name_change.new_id.id = KDBUS_MATCH_ID_ANY; + + /* Copy in the well-known name we're interested in */ + strcpy(item->name_change.name, name); + + /* + * Add the match through the KDBUS_CMD_MATCH_ADD ioctl, employed on + * the connection owner fd. + */ + r = kdbus_cmd_match_add(b->fd, match); + if (r < 0) + return err_r(r, "cannot add match"); + + return 0; +} + +static int bus_poll(struct bus *b) +{ + struct pollfd fds[1] = {}; + int r; + + /* + * A connection endpoint supports poll() and will wake-up the + * task with POLLIN set once a message has arrived. + */ + fds[0].fd = b->fd; + fds[0].events = POLLIN; + r = poll(fds, sizeof(fds) / sizeof(*fds), 0); + if (r < 0) + return err("cannot poll bus"); + return !!(fds[0].revents & POLLIN); +} + +static int bus_make(uid_t uid, const char *name) +{ + struct kdbus_item *item; + struct kdbus_cmd *make; + char path[128], busname[128]; + size_t size; + int r, fd; + + /* + * Compute the full path to the 'control' node. 'arg_modname' may be + * set to a different value than 'kdbus' for development purposes. + * The 'control' node is the primary entry point to kdbus that must be + * used in order to create a bus. See kdbus(7) and kdbus.bus(7). 
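+ * (For example, with the default arg_modname of "kdbus" this opens + * /sys/fs/kdbus/control; if the creating process had, say, UID 1000 and + * used the default arg_busname "example-workers", the bus node opened + * later by bus_open_connection() would be + * /sys/fs/kdbus/1000-example-workers/bus.)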
+ */ + snprintf(path, sizeof(path), "/sys/fs/%s/control", arg_modname); + + /* + * Compute the bus name. A valid bus name must always be prefixed with + * the EUID of the currently running process in order to avoid name + * conflicts. See kdbus.bus(7). + */ + snprintf(busname, sizeof(busname), "%lu-%s", (unsigned long)uid, name); + + fd = open(path, O_RDWR | O_CLOEXEC); + if (fd < 0) + return err("cannot open control file"); + + /* + * The KDBUS_CMD_BUS_MAKE ioctl takes an argument of type + * 'struct kdbus_cmd', and expects at least two items attached to + * it: one to decribe the bloom parameters to be propagated to + * connections of the bus, and the name of the bus that was computed + * above. Assemble this struct now, and fill it with values. + */ + size = sizeof(*make); + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_bloom_parameter)); + size += KDBUS_ITEM_SIZE(strlen(busname) + 1); + + make = alloca(size); + memset(make, 0, size); + make->size = size; + + /* + * Each item has a 'type' and 'size' field, and must be stored at an + * 8-byte aligned address. The KDBUS_ITEM_NEXT macro is used to advance + * the pointer. See kdbus.item(7) for more details. + */ + item = make->items; + item->type = KDBUS_ITEM_BLOOM_PARAMETER; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(item->bloom_parameter); + item->bloom_parameter.size = 8; + item->bloom_parameter.n_hash = 1; + + /* The name of the new bus is stored in the next item. */ + item = KDBUS_ITEM_NEXT(item); + item->type = KDBUS_ITEM_MAKE_NAME; + item->size = KDBUS_ITEM_HEADER_SIZE + strlen(busname) + 1; + strcpy(item->str, busname); + + /* + * Now create the bus via the KDBUS_CMD_BUS_MAKE ioctl and return the + * fd that was used back to the caller of this function. This fd is now + * called a 'bus owner file descriptor', and it controls the life-time + * of the newly created bus; once the file descriptor is closed, the + * bus goes away, and all connections are shut down. See kdbus.bus(7). 
+ */ + r = kdbus_cmd_bus_make(fd, make); + if (r < 0) { + err_r(r, "cannot make bus"); + close(fd); + return r; + } + + return fd; +} + +#else + +#warning "Skipping compilation due to unsupported libc version" + +int main(int argc, char **argv) +{ + fprintf(stderr, + "Compilation of %s was skipped due to unsupported libc.\n", + argv[0]); + + return EXIT_FAILURE; +} + +#endif /* libc sanity check */ diff -Nur linux-4.2/scripts/basic/.fixdep.cmd linux-4.2-pck/scripts/basic/.fixdep.cmd --- linux-4.2/scripts/basic/.fixdep.cmd 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/basic/.fixdep.cmd 2015-09-08 01:26:14.022509469 -0300 @@ -0,0 +1,90 @@ +cmd_scripts/basic/fixdep := gcc -Wp,-MD,scripts/basic/.fixdep.d -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -o scripts/basic/fixdep scripts/basic/fixdep.c + +source_scripts/basic/fixdep := scripts/basic/fixdep.c + +deps_scripts/basic/fixdep := \ + $(wildcard include/config/his/driver.h) \ + $(wildcard include/config/my/option.h) \ + $(wildcard include/config/.h) \ + $(wildcard include/config/foo.h) \ + $(wildcard include/config/boom.h) \ + /usr/include/stdc-predef.h \ + /usr/include/sys/types.h \ + /usr/include/features.h \ + /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h \ + /usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h \ + /usr/include/bits/typesizes.h \ + /usr/include/time.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stddef.h \ + /usr/include/endian.h \ + /usr/include/bits/endian.h \ + /usr/include/bits/byteswap.h \ + /usr/include/bits/byteswap-16.h \ + /usr/include/sys/select.h \ + /usr/include/bits/select.h \ + /usr/include/bits/sigset.h \ + /usr/include/bits/time.h \ + /usr/include/sys/sysmacros.h \ + /usr/include/bits/pthreadtypes.h \ + /usr/include/sys/stat.h \ + /usr/include/bits/stat.h \ + /usr/include/sys/mman.h \ + /usr/include/bits/mman.h \ + /usr/include/bits/mman-linux.h \ + /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h \ + /usr/include/bits/environments.h \ + /usr/include/bits/confname.h \ + /usr/include/getopt.h \ + /usr/include/fcntl.h \ + /usr/include/bits/fcntl.h \ + /usr/include/bits/fcntl-linux.h \ + /usr/include/string.h \ + /usr/include/xlocale.h \ + /usr/include/bits/string.h \ + /usr/include/bits/string2.h \ + /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h \ + /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h \ + /usr/include/bits/stdlib-float.h \ + /usr/include/stdio.h \ + /usr/include/libio.h \ + /usr/include/_G_config.h \ + /usr/include/wchar.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdarg.h \ + /usr/include/bits/stdio_lim.h \ + /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include-fixed/limits.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include-fixed/syslimits.h \ + /usr/include/limits.h \ + /usr/include/bits/posix1_lim.h \ + /usr/include/bits/local_lim.h \ + /usr/include/linux/limits.h \ + /usr/include/bits/posix2_lim.h \ + /usr/include/ctype.h \ + /usr/include/arpa/inet.h \ + /usr/include/netinet/in.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdint.h \ + /usr/include/stdint.h \ + /usr/include/bits/wchar.h \ + /usr/include/sys/socket.h \ + /usr/include/sys/uio.h \ + /usr/include/bits/uio.h \ + /usr/include/bits/socket.h \ + /usr/include/bits/socket_type.h \ + /usr/include/bits/sockaddr.h \ + /usr/include/asm/socket.h \ + 
/usr/include/asm-generic/socket.h \ + /usr/include/asm/sockios.h \ + /usr/include/asm-generic/sockios.h \ + /usr/include/bits/in.h \ + +scripts/basic/fixdep: $(deps_scripts/basic/fixdep) + +$(deps_scripts/basic/fixdep): Binary files linux-4.2/scripts/basic/fixdep and linux-4.2-pck/scripts/basic/fixdep differ diff -Nur linux-4.2/scripts/kconfig/.mconf.cmd linux-4.2-pck/scripts/kconfig/.mconf.cmd --- linux-4.2/scripts/kconfig/.mconf.cmd 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/.mconf.cmd 2015-09-08 01:26:19.482241328 -0300 @@ -0,0 +1 @@ +cmd_scripts/kconfig/mconf := gcc -o scripts/kconfig/mconf scripts/kconfig/mconf.o scripts/kconfig/zconf.tab.o scripts/kconfig/lxdialog/checklist.o scripts/kconfig/lxdialog/util.o scripts/kconfig/lxdialog/inputbox.o scripts/kconfig/lxdialog/textbox.o scripts/kconfig/lxdialog/yesno.o scripts/kconfig/lxdialog/menubox.o -lncursesw diff -Nur linux-4.2/scripts/kconfig/.mconf.o.cmd linux-4.2-pck/scripts/kconfig/.mconf.o.cmd --- linux-4.2/scripts/kconfig/.mconf.o.cmd 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/.mconf.o.cmd 2015-09-08 01:26:14.832469692 -0300 @@ -0,0 +1,100 @@ +cmd_scripts/kconfig/mconf.o := gcc -Wp,-MD,scripts/kconfig/.mconf.o.d -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -DCURSES_LOC="" -DNCURSES_WIDECHAR=1 -DLOCALE -c -o scripts/kconfig/mconf.o scripts/kconfig/mconf.c + +source_scripts/kconfig/mconf.o := scripts/kconfig/mconf.c + +deps_scripts/kconfig/mconf.o := \ + $(wildcard include/config/mode.h) \ + $(wildcard include/config/color.h) \ + $(wildcard include/config/.h) \ + /usr/include/stdc-predef.h \ + /usr/include/ctype.h \ + /usr/include/features.h \ + /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h \ + /usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h \ + /usr/include/bits/typesizes.h \ + /usr/include/endian.h \ + /usr/include/bits/endian.h \ + /usr/include/bits/byteswap.h \ + /usr/include/bits/byteswap-16.h \ + /usr/include/xlocale.h \ + /usr/include/errno.h \ + /usr/include/bits/errno.h \ + /usr/include/linux/errno.h \ + /usr/include/asm/errno.h \ + /usr/include/asm-generic/errno.h \ + /usr/include/asm-generic/errno-base.h \ + /usr/include/fcntl.h \ + /usr/include/bits/fcntl.h \ + /usr/include/bits/fcntl-linux.h \ + /usr/include/time.h \ + /usr/include/bits/stat.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include-fixed/limits.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include-fixed/syslimits.h \ + /usr/include/limits.h \ + /usr/include/bits/posix1_lim.h \ + /usr/include/bits/local_lim.h \ + /usr/include/linux/limits.h \ + /usr/include/bits/posix2_lim.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdarg.h \ + /usr/include/stdlib.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stddef.h \ + /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h \ + /usr/include/sys/types.h \ + /usr/include/sys/select.h \ + /usr/include/bits/select.h \ + /usr/include/bits/sigset.h \ + /usr/include/bits/time.h \ + /usr/include/sys/sysmacros.h \ + /usr/include/bits/pthreadtypes.h \ + /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h \ + /usr/include/bits/stdlib-float.h \ + /usr/include/string.h \ + /usr/include/bits/string.h \ + /usr/include/bits/string2.h \ + /usr/include/signal.h \ + /usr/include/bits/signum.h \ + /usr/include/bits/siginfo.h \ + /usr/include/bits/sigaction.h \ + /usr/include/bits/sigcontext.h \ + /usr/include/bits/sigstack.h \ + 
/usr/include/sys/ucontext.h \ + /usr/include/bits/sigthread.h \ + /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h \ + /usr/include/bits/environments.h \ + /usr/include/bits/confname.h \ + /usr/include/getopt.h \ + /usr/include/locale.h \ + /usr/include/bits/locale.h \ + scripts/kconfig/lkc.h \ + $(wildcard include/config/prefix.h) \ + $(wildcard include/config/list.h) \ + $(wildcard include/config/y.h) \ + scripts/kconfig/expr.h \ + $(wildcard include/config/config.h) \ + /usr/include/assert.h \ + /usr/include/stdio.h \ + /usr/include/libio.h \ + /usr/include/_G_config.h \ + /usr/include/wchar.h \ + /usr/include/bits/stdio_lim.h \ + /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + scripts/kconfig/list.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdbool.h \ + /usr/include/libintl.h \ + scripts/kconfig/lkc_proto.h \ + scripts/kconfig/lxdialog/dialog.h \ + /usr/include/curses.h \ + /usr/include/ncurses_dll.h \ + /usr/include/unctrl.h \ + /usr/include/curses.h \ + +scripts/kconfig/mconf.o: $(deps_scripts/kconfig/mconf.o) + +$(deps_scripts/kconfig/mconf.o): diff -Nur linux-4.2/scripts/kconfig/.zconf.tab.o.cmd linux-4.2-pck/scripts/kconfig/.zconf.tab.o.cmd --- linux-4.2/scripts/kconfig/.zconf.tab.o.cmd 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/.zconf.tab.o.cmd 2015-09-08 01:26:16.845704149 -0300 @@ -0,0 +1,101 @@ +cmd_scripts/kconfig/zconf.tab.o := gcc -Wp,-MD,scripts/kconfig/.zconf.tab.o.d -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -DCURSES_LOC="" -DNCURSES_WIDECHAR=1 -DLOCALE -Iscripts/kconfig -c -o scripts/kconfig/zconf.tab.o scripts/kconfig/zconf.tab.c + +source_scripts/kconfig/zconf.tab.o := scripts/kconfig/zconf.tab.c + +deps_scripts/kconfig/zconf.tab.o := \ + /usr/include/stdc-predef.h \ + /usr/include/ctype.h \ + /usr/include/features.h \ + /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h \ + /usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h \ + /usr/include/bits/typesizes.h \ + /usr/include/endian.h \ + /usr/include/bits/endian.h \ + /usr/include/bits/byteswap.h \ + /usr/include/bits/byteswap-16.h \ + /usr/include/xlocale.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdarg.h \ + /usr/include/stdio.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stddef.h \ + /usr/include/libio.h \ + /usr/include/_G_config.h \ + /usr/include/wchar.h \ + /usr/include/bits/stdio_lim.h \ + /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h \ + /usr/include/sys/types.h \ + /usr/include/time.h \ + /usr/include/sys/select.h \ + /usr/include/bits/select.h \ + /usr/include/bits/sigset.h \ + /usr/include/bits/time.h \ + /usr/include/sys/sysmacros.h \ + /usr/include/bits/pthreadtypes.h \ + /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h \ + /usr/include/bits/stdlib-float.h \ + /usr/include/string.h \ + /usr/include/bits/string.h \ + /usr/include/bits/string2.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdbool.h \ + scripts/kconfig/lkc.h \ + $(wildcard include/config/.h) \ + $(wildcard include/config/prefix.h) \ + $(wildcard include/config/list.h) \ + $(wildcard include/config/y.h) \ + scripts/kconfig/expr.h \ + $(wildcard include/config/config.h) \ + /usr/include/assert.h \ + scripts/kconfig/list.h \ + /usr/include/libintl.h \ + /usr/include/locale.h \ + /usr/include/bits/locale.h \ + scripts/kconfig/lkc_proto.h 
\ + scripts/kconfig/zconf.hash.c \ + scripts/kconfig/zconf.lex.c \ + /usr/include/errno.h \ + /usr/include/bits/errno.h \ + /usr/include/linux/errno.h \ + /usr/include/asm/errno.h \ + /usr/include/asm-generic/errno.h \ + /usr/include/asm-generic/errno-base.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include-fixed/limits.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include-fixed/syslimits.h \ + /usr/include/limits.h \ + /usr/include/bits/posix1_lim.h \ + /usr/include/bits/local_lim.h \ + /usr/include/linux/limits.h \ + /usr/include/bits/posix2_lim.h \ + /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h \ + /usr/include/bits/environments.h \ + /usr/include/bits/confname.h \ + /usr/include/getopt.h \ + scripts/kconfig/util.c \ + scripts/kconfig/confdata.c \ + $(wildcard include/config/autoconfig.h) \ + $(wildcard include/config/overwriteconfig.h) \ + $(wildcard include/config/autoheader.h) \ + $(wildcard include/config/tristate.h) \ + $(wildcard include/config/probability.h) \ + /usr/include/sys/stat.h \ + /usr/include/bits/stat.h \ + /usr/include/fcntl.h \ + /usr/include/bits/fcntl.h \ + /usr/include/bits/fcntl-linux.h \ + scripts/kconfig/expr.c \ + scripts/kconfig/symbol.c \ + /usr/include/regex.h \ + /usr/include/sys/utsname.h \ + /usr/include/bits/utsname.h \ + scripts/kconfig/menu.c \ + +scripts/kconfig/zconf.tab.o: $(deps_scripts/kconfig/zconf.tab.o) + +$(deps_scripts/kconfig/zconf.tab.o): diff -Nur linux-4.2/scripts/kconfig/lxdialog/.checklist.o.cmd linux-4.2-pck/scripts/kconfig/lxdialog/.checklist.o.cmd --- linux-4.2/scripts/kconfig/lxdialog/.checklist.o.cmd 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/lxdialog/.checklist.o.cmd 2015-09-08 01:26:17.362345442 -0300 @@ -0,0 +1,67 @@ +cmd_scripts/kconfig/lxdialog/checklist.o := gcc -Wp,-MD,scripts/kconfig/lxdialog/.checklist.o.d -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -DCURSES_LOC="" -DNCURSES_WIDECHAR=1 -DLOCALE -c -o scripts/kconfig/lxdialog/checklist.o scripts/kconfig/lxdialog/checklist.c + +source_scripts/kconfig/lxdialog/checklist.o := scripts/kconfig/lxdialog/checklist.c + +deps_scripts/kconfig/lxdialog/checklist.o := \ + /usr/include/stdc-predef.h \ + scripts/kconfig/lxdialog/dialog.h \ + /usr/include/sys/types.h \ + /usr/include/features.h \ + /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h \ + /usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h \ + /usr/include/bits/typesizes.h \ + /usr/include/time.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stddef.h \ + /usr/include/endian.h \ + /usr/include/bits/endian.h \ + /usr/include/bits/byteswap.h \ + /usr/include/bits/byteswap-16.h \ + /usr/include/sys/select.h \ + /usr/include/bits/select.h \ + /usr/include/bits/sigset.h \ + /usr/include/bits/time.h \ + /usr/include/sys/sysmacros.h \ + /usr/include/bits/pthreadtypes.h \ + /usr/include/fcntl.h \ + /usr/include/bits/fcntl.h \ + /usr/include/bits/fcntl-linux.h \ + /usr/include/bits/stat.h \ + /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h \ + /usr/include/bits/environments.h \ + /usr/include/bits/confname.h \ + /usr/include/getopt.h \ + /usr/include/ctype.h \ + /usr/include/xlocale.h \ + /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h \ + /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h \ + /usr/include/bits/stdlib-float.h \ + /usr/include/string.h \ + /usr/include/bits/string.h \ + /usr/include/bits/string2.h \ + 
/usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdbool.h \ + /usr/include/libintl.h \ + /usr/include/locale.h \ + /usr/include/bits/locale.h \ + /usr/include/curses.h \ + /usr/include/ncurses_dll.h \ + /usr/include/stdio.h \ + /usr/include/libio.h \ + /usr/include/_G_config.h \ + /usr/include/wchar.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdarg.h \ + /usr/include/bits/stdio_lim.h \ + /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + /usr/include/unctrl.h \ + /usr/include/curses.h \ + +scripts/kconfig/lxdialog/checklist.o: $(deps_scripts/kconfig/lxdialog/checklist.o) + +$(deps_scripts/kconfig/lxdialog/checklist.o): diff -Nur linux-4.2/scripts/kconfig/lxdialog/.inputbox.o.cmd linux-4.2-pck/scripts/kconfig/lxdialog/.inputbox.o.cmd --- linux-4.2/scripts/kconfig/lxdialog/.inputbox.o.cmd 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/lxdialog/.inputbox.o.cmd 2015-09-08 01:26:18.375629008 -0300 @@ -0,0 +1,67 @@ +cmd_scripts/kconfig/lxdialog/inputbox.o := gcc -Wp,-MD,scripts/kconfig/lxdialog/.inputbox.o.d -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -DCURSES_LOC="" -DNCURSES_WIDECHAR=1 -DLOCALE -c -o scripts/kconfig/lxdialog/inputbox.o scripts/kconfig/lxdialog/inputbox.c + +source_scripts/kconfig/lxdialog/inputbox.o := scripts/kconfig/lxdialog/inputbox.c + +deps_scripts/kconfig/lxdialog/inputbox.o := \ + /usr/include/stdc-predef.h \ + scripts/kconfig/lxdialog/dialog.h \ + /usr/include/sys/types.h \ + /usr/include/features.h \ + /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h \ + /usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h \ + /usr/include/bits/typesizes.h \ + /usr/include/time.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stddef.h \ + /usr/include/endian.h \ + /usr/include/bits/endian.h \ + /usr/include/bits/byteswap.h \ + /usr/include/bits/byteswap-16.h \ + /usr/include/sys/select.h \ + /usr/include/bits/select.h \ + /usr/include/bits/sigset.h \ + /usr/include/bits/time.h \ + /usr/include/sys/sysmacros.h \ + /usr/include/bits/pthreadtypes.h \ + /usr/include/fcntl.h \ + /usr/include/bits/fcntl.h \ + /usr/include/bits/fcntl-linux.h \ + /usr/include/bits/stat.h \ + /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h \ + /usr/include/bits/environments.h \ + /usr/include/bits/confname.h \ + /usr/include/getopt.h \ + /usr/include/ctype.h \ + /usr/include/xlocale.h \ + /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h \ + /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h \ + /usr/include/bits/stdlib-float.h \ + /usr/include/string.h \ + /usr/include/bits/string.h \ + /usr/include/bits/string2.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdbool.h \ + /usr/include/libintl.h \ + /usr/include/locale.h \ + /usr/include/bits/locale.h \ + /usr/include/curses.h \ + /usr/include/ncurses_dll.h \ + /usr/include/stdio.h \ + /usr/include/libio.h \ + /usr/include/_G_config.h \ + /usr/include/wchar.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdarg.h \ + /usr/include/bits/stdio_lim.h \ + /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + /usr/include/unctrl.h \ + /usr/include/curses.h \ + +scripts/kconfig/lxdialog/inputbox.o: $(deps_scripts/kconfig/lxdialog/inputbox.o) + +$(deps_scripts/kconfig/lxdialog/inputbox.o): diff -Nur linux-4.2/scripts/kconfig/lxdialog/.menubox.o.cmd linux-4.2-pck/scripts/kconfig/lxdialog/.menubox.o.cmd --- 
linux-4.2/scripts/kconfig/lxdialog/.menubox.o.cmd 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/lxdialog/.menubox.o.cmd 2015-09-08 01:26:19.415577936 -0300 @@ -0,0 +1,67 @@ +cmd_scripts/kconfig/lxdialog/menubox.o := gcc -Wp,-MD,scripts/kconfig/lxdialog/.menubox.o.d -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -DCURSES_LOC="" -DNCURSES_WIDECHAR=1 -DLOCALE -c -o scripts/kconfig/lxdialog/menubox.o scripts/kconfig/lxdialog/menubox.c + +source_scripts/kconfig/lxdialog/menubox.o := scripts/kconfig/lxdialog/menubox.c + +deps_scripts/kconfig/lxdialog/menubox.o := \ + /usr/include/stdc-predef.h \ + scripts/kconfig/lxdialog/dialog.h \ + /usr/include/sys/types.h \ + /usr/include/features.h \ + /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h \ + /usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h \ + /usr/include/bits/typesizes.h \ + /usr/include/time.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stddef.h \ + /usr/include/endian.h \ + /usr/include/bits/endian.h \ + /usr/include/bits/byteswap.h \ + /usr/include/bits/byteswap-16.h \ + /usr/include/sys/select.h \ + /usr/include/bits/select.h \ + /usr/include/bits/sigset.h \ + /usr/include/bits/time.h \ + /usr/include/sys/sysmacros.h \ + /usr/include/bits/pthreadtypes.h \ + /usr/include/fcntl.h \ + /usr/include/bits/fcntl.h \ + /usr/include/bits/fcntl-linux.h \ + /usr/include/bits/stat.h \ + /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h \ + /usr/include/bits/environments.h \ + /usr/include/bits/confname.h \ + /usr/include/getopt.h \ + /usr/include/ctype.h \ + /usr/include/xlocale.h \ + /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h \ + /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h \ + /usr/include/bits/stdlib-float.h \ + /usr/include/string.h \ + /usr/include/bits/string.h \ + /usr/include/bits/string2.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdbool.h \ + /usr/include/libintl.h \ + /usr/include/locale.h \ + /usr/include/bits/locale.h \ + /usr/include/curses.h \ + /usr/include/ncurses_dll.h \ + /usr/include/stdio.h \ + /usr/include/libio.h \ + /usr/include/_G_config.h \ + /usr/include/wchar.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdarg.h \ + /usr/include/bits/stdio_lim.h \ + /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + /usr/include/unctrl.h \ + /usr/include/curses.h \ + +scripts/kconfig/lxdialog/menubox.o: $(deps_scripts/kconfig/lxdialog/menubox.o) + +$(deps_scripts/kconfig/lxdialog/menubox.o): diff -Nur linux-4.2/scripts/kconfig/lxdialog/.textbox.o.cmd linux-4.2-pck/scripts/kconfig/lxdialog/.textbox.o.cmd --- linux-4.2/scripts/kconfig/lxdialog/.textbox.o.cmd 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/lxdialog/.textbox.o.cmd 2015-09-08 01:26:18.732278161 -0300 @@ -0,0 +1,67 @@ +cmd_scripts/kconfig/lxdialog/textbox.o := gcc -Wp,-MD,scripts/kconfig/lxdialog/.textbox.o.d -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -DCURSES_LOC="" -DNCURSES_WIDECHAR=1 -DLOCALE -c -o scripts/kconfig/lxdialog/textbox.o scripts/kconfig/lxdialog/textbox.c + +source_scripts/kconfig/lxdialog/textbox.o := scripts/kconfig/lxdialog/textbox.c + +deps_scripts/kconfig/lxdialog/textbox.o := \ + /usr/include/stdc-predef.h \ + scripts/kconfig/lxdialog/dialog.h \ + /usr/include/sys/types.h \ + /usr/include/features.h \ + /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h \ + 
/usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h \ + /usr/include/bits/typesizes.h \ + /usr/include/time.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stddef.h \ + /usr/include/endian.h \ + /usr/include/bits/endian.h \ + /usr/include/bits/byteswap.h \ + /usr/include/bits/byteswap-16.h \ + /usr/include/sys/select.h \ + /usr/include/bits/select.h \ + /usr/include/bits/sigset.h \ + /usr/include/bits/time.h \ + /usr/include/sys/sysmacros.h \ + /usr/include/bits/pthreadtypes.h \ + /usr/include/fcntl.h \ + /usr/include/bits/fcntl.h \ + /usr/include/bits/fcntl-linux.h \ + /usr/include/bits/stat.h \ + /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h \ + /usr/include/bits/environments.h \ + /usr/include/bits/confname.h \ + /usr/include/getopt.h \ + /usr/include/ctype.h \ + /usr/include/xlocale.h \ + /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h \ + /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h \ + /usr/include/bits/stdlib-float.h \ + /usr/include/string.h \ + /usr/include/bits/string.h \ + /usr/include/bits/string2.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdbool.h \ + /usr/include/libintl.h \ + /usr/include/locale.h \ + /usr/include/bits/locale.h \ + /usr/include/curses.h \ + /usr/include/ncurses_dll.h \ + /usr/include/stdio.h \ + /usr/include/libio.h \ + /usr/include/_G_config.h \ + /usr/include/wchar.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdarg.h \ + /usr/include/bits/stdio_lim.h \ + /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + /usr/include/unctrl.h \ + /usr/include/curses.h \ + +scripts/kconfig/lxdialog/textbox.o: $(deps_scripts/kconfig/lxdialog/textbox.o) + +$(deps_scripts/kconfig/lxdialog/textbox.o): diff -Nur linux-4.2/scripts/kconfig/lxdialog/.util.o.cmd linux-4.2-pck/scripts/kconfig/lxdialog/.util.o.cmd --- linux-4.2/scripts/kconfig/lxdialog/.util.o.cmd 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/lxdialog/.util.o.cmd 2015-09-08 01:26:17.828989191 -0300 @@ -0,0 +1,68 @@ +cmd_scripts/kconfig/lxdialog/util.o := gcc -Wp,-MD,scripts/kconfig/lxdialog/.util.o.d -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -DCURSES_LOC="" -DNCURSES_WIDECHAR=1 -DLOCALE -c -o scripts/kconfig/lxdialog/util.o scripts/kconfig/lxdialog/util.c + +source_scripts/kconfig/lxdialog/util.o := scripts/kconfig/lxdialog/util.c + +deps_scripts/kconfig/lxdialog/util.o := \ + $(wildcard include/config/color.h) \ + /usr/include/stdc-predef.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdarg.h \ + scripts/kconfig/lxdialog/dialog.h \ + /usr/include/sys/types.h \ + /usr/include/features.h \ + /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h \ + /usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h \ + /usr/include/bits/typesizes.h \ + /usr/include/time.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stddef.h \ + /usr/include/endian.h \ + /usr/include/bits/endian.h \ + /usr/include/bits/byteswap.h \ + /usr/include/bits/byteswap-16.h \ + /usr/include/sys/select.h \ + /usr/include/bits/select.h \ + /usr/include/bits/sigset.h \ + /usr/include/bits/time.h \ + /usr/include/sys/sysmacros.h \ + /usr/include/bits/pthreadtypes.h \ + /usr/include/fcntl.h \ + /usr/include/bits/fcntl.h \ + /usr/include/bits/fcntl-linux.h \ + /usr/include/bits/stat.h \ + /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h \ + /usr/include/bits/environments.h \ + 
/usr/include/bits/confname.h \ + /usr/include/getopt.h \ + /usr/include/ctype.h \ + /usr/include/xlocale.h \ + /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h \ + /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h \ + /usr/include/bits/stdlib-float.h \ + /usr/include/string.h \ + /usr/include/bits/string.h \ + /usr/include/bits/string2.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdbool.h \ + /usr/include/libintl.h \ + /usr/include/locale.h \ + /usr/include/bits/locale.h \ + /usr/include/curses.h \ + /usr/include/ncurses_dll.h \ + /usr/include/stdio.h \ + /usr/include/libio.h \ + /usr/include/_G_config.h \ + /usr/include/wchar.h \ + /usr/include/bits/stdio_lim.h \ + /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + /usr/include/unctrl.h \ + /usr/include/curses.h \ + +scripts/kconfig/lxdialog/util.o: $(deps_scripts/kconfig/lxdialog/util.o) + +$(deps_scripts/kconfig/lxdialog/util.o): diff -Nur linux-4.2/scripts/kconfig/lxdialog/.yesno.o.cmd linux-4.2-pck/scripts/kconfig/lxdialog/.yesno.o.cmd --- linux-4.2/scripts/kconfig/lxdialog/.yesno.o.cmd 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/lxdialog/.yesno.o.cmd 2015-09-08 01:26:19.025597088 -0300 @@ -0,0 +1,67 @@ +cmd_scripts/kconfig/lxdialog/yesno.o := gcc -Wp,-MD,scripts/kconfig/lxdialog/.yesno.o.d -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 -DCURSES_LOC="" -DNCURSES_WIDECHAR=1 -DLOCALE -c -o scripts/kconfig/lxdialog/yesno.o scripts/kconfig/lxdialog/yesno.c + +source_scripts/kconfig/lxdialog/yesno.o := scripts/kconfig/lxdialog/yesno.c + +deps_scripts/kconfig/lxdialog/yesno.o := \ + /usr/include/stdc-predef.h \ + scripts/kconfig/lxdialog/dialog.h \ + /usr/include/sys/types.h \ + /usr/include/features.h \ + /usr/include/sys/cdefs.h \ + /usr/include/bits/wordsize.h \ + /usr/include/gnu/stubs.h \ + /usr/include/gnu/stubs-64.h \ + /usr/include/bits/types.h \ + /usr/include/bits/typesizes.h \ + /usr/include/time.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stddef.h \ + /usr/include/endian.h \ + /usr/include/bits/endian.h \ + /usr/include/bits/byteswap.h \ + /usr/include/bits/byteswap-16.h \ + /usr/include/sys/select.h \ + /usr/include/bits/select.h \ + /usr/include/bits/sigset.h \ + /usr/include/bits/time.h \ + /usr/include/sys/sysmacros.h \ + /usr/include/bits/pthreadtypes.h \ + /usr/include/fcntl.h \ + /usr/include/bits/fcntl.h \ + /usr/include/bits/fcntl-linux.h \ + /usr/include/bits/stat.h \ + /usr/include/unistd.h \ + /usr/include/bits/posix_opt.h \ + /usr/include/bits/environments.h \ + /usr/include/bits/confname.h \ + /usr/include/getopt.h \ + /usr/include/ctype.h \ + /usr/include/xlocale.h \ + /usr/include/stdlib.h \ + /usr/include/bits/waitflags.h \ + /usr/include/bits/waitstatus.h \ + /usr/include/alloca.h \ + /usr/include/bits/stdlib-bsearch.h \ + /usr/include/bits/stdlib-float.h \ + /usr/include/string.h \ + /usr/include/bits/string.h \ + /usr/include/bits/string2.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdbool.h \ + /usr/include/libintl.h \ + /usr/include/locale.h \ + /usr/include/bits/locale.h \ + /usr/include/curses.h \ + /usr/include/ncurses_dll.h \ + /usr/include/stdio.h \ + /usr/include/libio.h \ + /usr/include/_G_config.h \ + /usr/include/wchar.h \ + /usr/lib/gcc/x86_64-unknown-linux-gnu/5.2.0/include/stdarg.h \ + /usr/include/bits/stdio_lim.h \ + /usr/include/bits/sys_errlist.h \ + /usr/include/bits/stdio.h \ + /usr/include/unctrl.h \ + 
/usr/include/curses.h \ + +scripts/kconfig/lxdialog/yesno.o: $(deps_scripts/kconfig/lxdialog/yesno.o) + +$(deps_scripts/kconfig/lxdialog/yesno.o): Binary files linux-4.2/scripts/kconfig/lxdialog/checklist.o and linux-4.2-pck/scripts/kconfig/lxdialog/checklist.o differ Binary files linux-4.2/scripts/kconfig/lxdialog/inputbox.o and linux-4.2-pck/scripts/kconfig/lxdialog/inputbox.o differ Binary files linux-4.2/scripts/kconfig/lxdialog/menubox.o and linux-4.2-pck/scripts/kconfig/lxdialog/menubox.o differ Binary files linux-4.2/scripts/kconfig/lxdialog/textbox.o and linux-4.2-pck/scripts/kconfig/lxdialog/textbox.o differ Binary files linux-4.2/scripts/kconfig/lxdialog/util.o and linux-4.2-pck/scripts/kconfig/lxdialog/util.o differ Binary files linux-4.2/scripts/kconfig/lxdialog/yesno.o and linux-4.2-pck/scripts/kconfig/lxdialog/yesno.o differ Binary files linux-4.2/scripts/kconfig/mconf and linux-4.2-pck/scripts/kconfig/mconf differ Binary files linux-4.2/scripts/kconfig/mconf.o and linux-4.2-pck/scripts/kconfig/mconf.o differ diff -Nur linux-4.2/scripts/kconfig/zconf.hash.c linux-4.2-pck/scripts/kconfig/zconf.hash.c --- linux-4.2/scripts/kconfig/zconf.hash.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/zconf.hash.c 2015-09-08 01:26:14.849135538 -0300 @@ -0,0 +1,289 @@ +/* ANSI-C code produced by gperf version 3.0.4 */ +/* Command-line: gperf -t --output-file scripts/kconfig/zconf.hash.c_shipped -a -C -E -g -k '1,3,$' -p -t scripts/kconfig/zconf.gperf */ + +#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \ + && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \ + && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \ + && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \ + && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \ + && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \ + && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \ + && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \ + && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \ + && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \ + && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \ + && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \ + && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \ + && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \ + && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \ + && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \ + && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \ + && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \ + && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \ + && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \ + && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \ + && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \ + && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."
+#endif + +#line 10 "scripts/kconfig/zconf.gperf" +struct kconf_id; + +static const struct kconf_id *kconf_id_lookup(register const char *str, register unsigned int len); +/* maximum key range = 71, duplicates = 0 */ + +#ifdef __GNUC__ +__inline +#else +#ifdef __cplusplus +inline +#endif +#endif +static unsigned int +kconf_id_hash (register const char *str, register unsigned int len) +{ + static const unsigned char asso_values[] = + { + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 5, 25, 25, + 0, 0, 0, 5, 0, 0, 73, 73, 5, 0, + 10, 5, 45, 73, 20, 20, 0, 15, 15, 73, + 20, 5, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73 + }; + register int hval = len; + + switch (hval) + { + default: + hval += asso_values[(unsigned char)str[2]]; + /*FALLTHROUGH*/ + case 2: + case 1: + hval += asso_values[(unsigned char)str[0]]; + break; + } + return hval + asso_values[(unsigned char)str[len - 1]]; +} + +struct kconf_id_strings_t + { + char kconf_id_strings_str2[sizeof("if")]; + char kconf_id_strings_str3[sizeof("int")]; + char kconf_id_strings_str5[sizeof("endif")]; + char kconf_id_strings_str7[sizeof("default")]; + char kconf_id_strings_str8[sizeof("tristate")]; + char kconf_id_strings_str9[sizeof("endchoice")]; + char kconf_id_strings_str12[sizeof("def_tristate")]; + char kconf_id_strings_str13[sizeof("def_bool")]; + char kconf_id_strings_str14[sizeof("defconfig_list")]; + char kconf_id_strings_str17[sizeof("on")]; + char kconf_id_strings_str18[sizeof("optional")]; + char kconf_id_strings_str21[sizeof("option")]; + char kconf_id_strings_str22[sizeof("endmenu")]; + char kconf_id_strings_str23[sizeof("mainmenu")]; + char kconf_id_strings_str25[sizeof("menuconfig")]; + char kconf_id_strings_str27[sizeof("modules")]; + char kconf_id_strings_str28[sizeof("allnoconfig_y")]; + char kconf_id_strings_str29[sizeof("menu")]; + char kconf_id_strings_str31[sizeof("select")]; + char kconf_id_strings_str32[sizeof("comment")]; + char kconf_id_strings_str33[sizeof("env")]; + char kconf_id_strings_str35[sizeof("range")]; + char kconf_id_strings_str36[sizeof("choice")]; + char kconf_id_strings_str39[sizeof("bool")]; + char kconf_id_strings_str41[sizeof("source")]; + char kconf_id_strings_str42[sizeof("visible")]; + char kconf_id_strings_str43[sizeof("hex")]; + char kconf_id_strings_str46[sizeof("config")]; + char kconf_id_strings_str47[sizeof("boolean")]; + char kconf_id_strings_str51[sizeof("string")]; + char kconf_id_strings_str54[sizeof("help")]; + char kconf_id_strings_str56[sizeof("prompt")]; + char kconf_id_strings_str72[sizeof("depends")]; + }; +static const struct kconf_id_strings_t kconf_id_strings_contents = + { + "if", + "int", + "endif", + "default", + 
"tristate", + "endchoice", + "def_tristate", + "def_bool", + "defconfig_list", + "on", + "optional", + "option", + "endmenu", + "mainmenu", + "menuconfig", + "modules", + "allnoconfig_y", + "menu", + "select", + "comment", + "env", + "range", + "choice", + "bool", + "source", + "visible", + "hex", + "config", + "boolean", + "string", + "help", + "prompt", + "depends" + }; +#define kconf_id_strings ((const char *) &kconf_id_strings_contents) +#ifdef __GNUC__ +__inline +#if defined __GNUC_STDC_INLINE__ || defined __GNUC_GNU_INLINE__ +__attribute__ ((__gnu_inline__)) +#endif +#endif +const struct kconf_id * +kconf_id_lookup (register const char *str, register unsigned int len) +{ + enum + { + TOTAL_KEYWORDS = 33, + MIN_WORD_LENGTH = 2, + MAX_WORD_LENGTH = 14, + MIN_HASH_VALUE = 2, + MAX_HASH_VALUE = 72 + }; + + static const struct kconf_id wordlist[] = + { + {-1}, {-1}, +#line 25 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str2, T_IF, TF_COMMAND|TF_PARAM}, +#line 36 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str3, T_TYPE, TF_COMMAND, S_INT}, + {-1}, +#line 26 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str5, T_ENDIF, TF_COMMAND}, + {-1}, +#line 29 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str7, T_DEFAULT, TF_COMMAND, S_UNKNOWN}, +#line 31 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str8, T_TYPE, TF_COMMAND, S_TRISTATE}, +#line 20 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str9, T_ENDCHOICE, TF_COMMAND}, + {-1}, {-1}, +#line 32 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str12, T_DEFAULT, TF_COMMAND, S_TRISTATE}, +#line 35 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str13, T_DEFAULT, TF_COMMAND, S_BOOLEAN}, +#line 45 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str14, T_OPT_DEFCONFIG_LIST,TF_OPTION}, + {-1}, {-1}, +#line 43 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str17, T_ON, TF_PARAM}, +#line 28 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str18, T_OPTIONAL, TF_COMMAND}, + {-1}, {-1}, +#line 42 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str21, T_OPTION, TF_COMMAND}, +#line 17 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str22, T_ENDMENU, TF_COMMAND}, +#line 15 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str23, T_MAINMENU, TF_COMMAND}, + {-1}, +#line 23 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str25, T_MENUCONFIG, TF_COMMAND}, + {-1}, +#line 44 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str27, T_OPT_MODULES, TF_OPTION}, +#line 47 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str28, T_OPT_ALLNOCONFIG_Y,TF_OPTION}, +#line 16 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str29, T_MENU, TF_COMMAND}, + {-1}, +#line 39 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct 
kconf_id_strings_t *)0)->kconf_id_strings_str31, T_SELECT, TF_COMMAND}, +#line 21 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str32, T_COMMENT, TF_COMMAND}, +#line 46 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str33, T_OPT_ENV, TF_OPTION}, + {-1}, +#line 40 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str35, T_RANGE, TF_COMMAND}, +#line 19 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str36, T_CHOICE, TF_COMMAND}, + {-1}, {-1}, +#line 33 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str39, T_TYPE, TF_COMMAND, S_BOOLEAN}, + {-1}, +#line 18 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str41, T_SOURCE, TF_COMMAND}, +#line 41 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str42, T_VISIBLE, TF_COMMAND}, +#line 37 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str43, T_TYPE, TF_COMMAND, S_HEX}, + {-1}, {-1}, +#line 22 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str46, T_CONFIG, TF_COMMAND}, +#line 34 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str47, T_TYPE, TF_COMMAND, S_BOOLEAN}, + {-1}, {-1}, {-1}, +#line 38 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str51, T_TYPE, TF_COMMAND, S_STRING}, + {-1}, {-1}, +#line 24 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str54, T_HELP, TF_COMMAND}, + {-1}, +#line 30 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str56, T_PROMPT, TF_COMMAND}, + {-1}, {-1}, {-1}, {-1}, {-1}, {-1}, {-1}, {-1}, {-1}, + {-1}, {-1}, {-1}, {-1}, {-1}, {-1}, +#line 27 "scripts/kconfig/zconf.gperf" + {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str72, T_DEPENDS, TF_COMMAND} + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + register int key = kconf_id_hash (str, len); + + if (key <= MAX_HASH_VALUE && key >= 0) + { + register int o = wordlist[key].name; + if (o >= 0) + { + register const char *s = o + kconf_id_strings; + + if (*str == *s && !strncmp (str + 1, s + 1, len - 1) && s[len] == '\0') + return &wordlist[key]; + } + } + } + return 0; +} +#line 48 "scripts/kconfig/zconf.gperf" + diff -Nur linux-4.2/scripts/kconfig/zconf.lex.c linux-4.2-pck/scripts/kconfig/zconf.lex.c --- linux-4.2/scripts/kconfig/zconf.lex.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/zconf.lex.c 2015-09-08 01:26:14.842469201 -0300 @@ -0,0 +1,2476 @@ + +#line 3 "scripts/kconfig/zconf.lex.c_shipped" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define yy_create_buffer zconf_create_buffer +#define yy_delete_buffer zconf_delete_buffer +#define yy_flex_debug zconf_flex_debug +#define yy_init_buffer zconf_init_buffer +#define yy_flush_buffer zconf_flush_buffer +#define yy_load_buffer_state zconf_load_buffer_state +#define yy_switch_to_buffer zconf_switch_to_buffer +#define yyin zconfin +#define yyleng zconfleng +#define yylex zconflex +#define yylineno zconflineno +#define yyout zconfout +#define yyrestart zconfrestart +#define yytext zconftext +#define yywrap 
zconfwrap +#define yyalloc zconfalloc +#define yyrealloc zconfrealloc +#define yyfree zconffree + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 35 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; +#endif /* ! C99 */ + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN (yy_start) = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START (((yy_start) - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE zconfrestart(zconfin ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#define YY_BUF_SIZE 16384 +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer.
+ */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +extern int zconfleng; + +extern FILE *zconfin, *zconfout; + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up zconftext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = (yy_hold_char); \ + YY_RESTORE_YY_MORE_OFFSET \ + (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up zconftext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, (yytext_ptr) ) + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via zconfrestart()), so that the user can continue scanning by + * just pointing zconfin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* Stack of input buffers. */ +static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ + ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. 
+ */ +#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] + +/* yy_hold_char holds the character lost when zconftext is formed. */ +static char yy_hold_char; +static int yy_n_chars; /* number of characters read into yy_ch_buf */ +int zconfleng; + +/* Points to current character in buffer. */ +static char *yy_c_buf_p = (char *) 0; +static int yy_init = 0; /* whether we need to initialize */ +static int yy_start = 0; /* start state number */ + +/* Flag which is used to allow zconfwrap()'s to do buffer switches + * instead of setting up a fresh zconfin. A bit of a hack ... + */ +static int yy_did_buffer_switch_on_eof; + +void zconfrestart (FILE *input_file ); +void zconf_switch_to_buffer (YY_BUFFER_STATE new_buffer ); +YY_BUFFER_STATE zconf_create_buffer (FILE *file,int size ); +void zconf_delete_buffer (YY_BUFFER_STATE b ); +void zconf_flush_buffer (YY_BUFFER_STATE b ); +void zconfpush_buffer_state (YY_BUFFER_STATE new_buffer ); +void zconfpop_buffer_state (void ); + +static void zconfensure_buffer_stack (void ); +static void zconf_load_buffer_state (void ); +static void zconf_init_buffer (YY_BUFFER_STATE b,FILE *file ); + +#define YY_FLUSH_BUFFER zconf_flush_buffer(YY_CURRENT_BUFFER ) + +YY_BUFFER_STATE zconf_scan_buffer (char *base,yy_size_t size ); +YY_BUFFER_STATE zconf_scan_string (yyconst char *yy_str ); +YY_BUFFER_STATE zconf_scan_bytes (yyconst char *bytes,int len ); + +void *zconfalloc (yy_size_t ); +void *zconfrealloc (void *,yy_size_t ); +void zconffree (void * ); + +#define yy_new_buffer zconf_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + zconfensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + zconf_create_buffer(zconfin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + zconfensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + zconf_create_buffer(zconfin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define zconfwrap(n) 1 +#define YY_SKIP_YYWRAP + +typedef unsigned char YY_CHAR; + +FILE *zconfin = (FILE *) 0, *zconfout = (FILE *) 0; + +typedef int yy_state_type; + +extern int zconflineno; + +int zconflineno = 1; + +extern char *zconftext; +#define yytext_ptr zconftext +static yyconst flex_int16_t yy_nxt[][19] = + { + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0 + }, + + { + 11, 12, 13, 14, 12, 12, 15, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12 + }, + + { + 11, 12, 13, 14, 12, 12, 15, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12 + }, + + { + 11, 16, 16, 17, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 18, 16, 16, 16, 16, 16 + }, + + { + 11, 16, 16, 17, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 18, 16, 16, 16, 16, 16 + + }, + + { + 11, 19, 20, 21, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19 + }, + + { + 11, 19, 20, 21, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19 + }, + + { + 11, 22, 22, 23, 22, 24, 22, 22, 24, 22, + 22, 22, 22, 22, 22, 22, 22, 25, 22 + }, + + { + 11, 22, 22, 23, 22, 24, 22, 22, 24, 22, + 22, 22, 22, 22, 22, 22, 22, 25, 22 + }, + + { + 11, 26, 27, 28, 29, 30, 31, 32, 30, 33, + 34, 35, 36, 36, 37, 38, 39, 40, 41 + + }, + + { + 11, 26, 27, 28, 29, 30, 31, 32, 30, 33, + 34, 35, 36, 36, 37, 38, 39, 40, 41 + }, + + { + -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, + -11, -11, -11, -11, -11, -11, -11, -11, -11 + }, + + { + 11, -12, -12, -12, -12, -12, -12, -12, -12, -12, + -12, -12, -12, -12, -12, -12, -12, -12, -12 + }, + + { + 11, -13, 42, 43, -13, -13, 44, -13, -13, -13, + -13, -13, -13, -13, -13, -13, -13, -13, -13 + }, + + { + 11, -14, -14, -14, -14, -14, -14, -14, -14, -14, + -14, -14, -14, -14, -14, -14, -14, -14, -14 + + }, + + { + 11, 45, 45, 46, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45 + }, + + { + 11, -16, -16, -16, -16, -16, -16, -16, -16, -16, + -16, -16, -16, -16, -16, -16, -16, -16, -16 + }, + + { + 11, -17, -17, -17, -17, -17, -17, -17, -17, -17, + -17, -17, -17, -17, -17, -17, -17, -17, -17 + }, + + { + 11, -18, -18, -18, -18, -18, -18, -18, -18, -18, + -18, -18, -18, 47, -18, -18, -18, -18, -18 + }, + + { + 11, 48, 48, -19, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48 + + }, + + { + 11, -20, 49, 50, -20, -20, -20, -20, -20, -20, + -20, -20, -20, -20, -20, -20, -20, -20, -20 + }, + + { + 11, 51, -21, -21, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 51 + }, + + { + 11, 52, 52, 53, 52, -22, 52, 52, -22, 52, + 52, 52, 52, 52, 52, 52, 52, -22, 52 + }, + + { + 11, -23, -23, -23, -23, -23, -23, -23, -23, -23, + -23, -23, -23, -23, -23, -23, -23, -23, -23 + }, + + { + 11, -24, -24, -24, -24, -24, -24, -24, -24, -24, + -24, -24, -24, -24, -24, -24, -24, -24, -24 + + }, + + { + 11, 54, 54, 55, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, 54 + }, + + { + 11, -26, -26, -26, -26, -26, -26, -26, -26, -26, + -26, -26, -26, -26, -26, -26, -26, -26, -26 + }, + + { + 11, -27, 56, -27, -27, -27, -27, -27, -27, -27, + -27, -27, -27, -27, -27, -27, -27, -27, -27 + }, + + { + 11, -28, -28, -28, -28, -28, -28, -28, -28, -28, + -28, -28, -28, -28, -28, -28, -28, -28, -28 + }, + + { + 11, -29, -29, -29, -29, -29, -29, -29, -29, -29, + -29, -29, -29, -29, -29, 57, -29, -29, -29 + + }, + + { + 
11, -30, -30, -30, -30, -30, -30, -30, -30, -30, + -30, -30, -30, -30, -30, -30, -30, -30, -30 + }, + + { + 11, 58, 58, -31, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 58 + }, + + { + 11, -32, -32, -32, -32, -32, -32, 59, -32, -32, + -32, -32, -32, -32, -32, -32, -32, -32, -32 + }, + + { + 11, -33, -33, -33, -33, -33, -33, -33, -33, -33, + -33, -33, -33, -33, -33, -33, -33, -33, -33 + }, + + { + 11, -34, -34, -34, -34, -34, -34, -34, -34, -34, + -34, -34, -34, -34, -34, -34, -34, -34, -34 + + }, + + { + 11, -35, -35, -35, -35, -35, -35, -35, -35, -35, + -35, 60, 61, 61, -35, -35, -35, -35, -35 + }, + + { + 11, -36, -36, -36, -36, -36, -36, -36, -36, -36, + -36, 61, 61, 61, -36, -36, -36, -36, -36 + }, + + { + 11, -37, -37, -37, -37, -37, -37, -37, -37, -37, + -37, -37, -37, -37, -37, 62, -37, -37, -37 + }, + + { + 11, -38, -38, -38, -38, -38, -38, -38, -38, -38, + -38, -38, -38, -38, -38, -38, -38, -38, -38 + }, + + { + 11, -39, -39, -39, -39, -39, -39, -39, -39, -39, + -39, -39, -39, -39, -39, 63, -39, -39, -39 + + }, + + { + 11, -40, -40, 64, -40, -40, -40, -40, -40, -40, + -40, -40, -40, -40, -40, -40, -40, -40, -40 + }, + + { + 11, -41, -41, -41, -41, -41, -41, -41, -41, -41, + -41, -41, -41, -41, -41, -41, -41, -41, 65 + }, + + { + 11, -42, 42, 43, -42, -42, 44, -42, -42, -42, + -42, -42, -42, -42, -42, -42, -42, -42, -42 + }, + + { + 11, -43, -43, -43, -43, -43, -43, -43, -43, -43, + -43, -43, -43, -43, -43, -43, -43, -43, -43 + }, + + { + 11, 45, 45, 46, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45 + + }, + + { + 11, 45, 45, 46, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45 + }, + + { + 11, -46, -46, -46, -46, -46, -46, -46, -46, -46, + -46, -46, -46, -46, -46, -46, -46, -46, -46 + }, + + { + 11, -47, -47, -47, -47, -47, -47, -47, -47, -47, + -47, -47, -47, 47, -47, -47, -47, -47, -47 + }, + + { + 11, 48, 48, -48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48 + }, + + { + 11, -49, 49, 50, -49, -49, -49, -49, -49, -49, + -49, -49, -49, -49, -49, -49, -49, -49, -49 + + }, + + { + 11, 51, -50, -50, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 51 + }, + + { + 11, -51, -51, -51, -51, -51, -51, -51, -51, -51, + -51, -51, -51, -51, -51, -51, -51, -51, -51 + }, + + { + 11, 52, 52, 53, 52, -52, 52, 52, -52, 52, + 52, 52, 52, 52, 52, 52, 52, -52, 52 + }, + + { + 11, -53, -53, -53, -53, -53, -53, -53, -53, -53, + -53, -53, -53, -53, -53, -53, -53, -53, -53 + }, + + { + 11, -54, -54, 55, -54, -54, -54, -54, -54, -54, + -54, -54, -54, -54, -54, -54, -54, -54, -54 + + }, + + { + 11, -55, -55, -55, -55, -55, -55, -55, -55, -55, + -55, -55, -55, -55, -55, -55, -55, -55, -55 + }, + + { + 11, -56, 56, -56, -56, -56, -56, -56, -56, -56, + -56, -56, -56, -56, -56, -56, -56, -56, -56 + }, + + { + 11, -57, -57, -57, -57, -57, -57, -57, -57, -57, + -57, -57, -57, -57, -57, -57, -57, -57, -57 + }, + + { + 11, 58, 58, -58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, 58 + }, + + { + 11, -59, -59, -59, -59, -59, -59, -59, -59, -59, + -59, -59, -59, -59, -59, -59, -59, -59, -59 + + }, + + { + 11, -60, -60, -60, -60, -60, -60, -60, -60, -60, + -60, 66, 61, 61, -60, -60, -60, -60, -60 + }, + + { + 11, -61, -61, -61, -61, -61, -61, -61, -61, -61, + -61, 61, 61, 61, -61, -61, -61, -61, -61 + }, + + { + 11, -62, -62, -62, -62, -62, -62, -62, -62, -62, + -62, -62, -62, -62, -62, -62, -62, -62, -62 + }, + + { + 11, -63, -63, -63, -63, -63, -63, -63, -63, -63, + -63, -63, -63, -63, -63, -63, -63, -63, -63 + }, + + { + 11, -64, -64, 
-64, -64, -64, -64, -64, -64, -64, + -64, -64, -64, -64, -64, -64, -64, -64, -64 + + }, + + { + 11, -65, -65, -65, -65, -65, -65, -65, -65, -65, + -65, -65, -65, -65, -65, -65, -65, -65, -65 + }, + + { + 11, -66, -66, -66, -66, -66, -66, -66, -66, -66, + -66, 61, 61, 61, -66, -66, -66, -66, -66 + }, + + } ; + +static yy_state_type yy_get_previous_state (void ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ); +static int yy_get_next_buffer (void ); +static void yy_fatal_error (yyconst char msg[] ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up zconftext. + */ +#define YY_DO_BEFORE_ACTION \ + (yytext_ptr) = yy_bp; \ + zconfleng = (size_t) (yy_cp - yy_bp); \ + (yy_hold_char) = *yy_cp; \ + *yy_cp = '\0'; \ + (yy_c_buf_p) = yy_cp; + +#define YY_NUM_RULES 38 +#define YY_END_OF_BUFFER 39 +/* This struct is not used in this scanner, + but its presence is necessary. */ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[67] = + { 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 39, 5, 4, 2, 3, 7, 8, 6, 37, 34, + 36, 29, 33, 32, 31, 27, 26, 21, 13, 20, + 24, 27, 11, 12, 23, 23, 18, 14, 19, 27, + 27, 4, 2, 3, 3, 1, 6, 37, 34, 36, + 35, 29, 28, 31, 30, 26, 15, 24, 9, 23, + 23, 16, 17, 25, 10, 22 + } ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 4, 5, 6, 1, 1, 7, 8, 9, + 10, 1, 1, 1, 11, 12, 12, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 1, 1, 14, + 15, 16, 1, 1, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 1, 17, 1, 1, 13, 1, 13, 13, 13, 13, + + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 1, 18, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +extern int zconf_flex_debug; +int zconf_flex_debug = 0; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +char *zconftext; +#define YY_NO_INPUT 1 + +/* + * Copyright (C) 2002 Roman Zippel + * Released under the terms of the GNU GPL v2.0. 
+ */ + +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "lkc.h" + +#define START_STRSIZE 16 + +static struct { + struct file *file; + int lineno; +} current_pos; + +static char *text; +static int text_size, text_asize; + +struct buffer { + struct buffer *parent; + YY_BUFFER_STATE state; +}; + +struct buffer *current_buf; + +static int last_ts, first_ts; + +static void zconf_endhelp(void); +static void zconf_endfile(void); + +static void new_string(void) +{ + text = xmalloc(START_STRSIZE); + text_asize = START_STRSIZE; + text_size = 0; + *text = 0; +} + +static void append_string(const char *str, int size) +{ + int new_size = text_size + size + 1; + if (new_size > text_asize) { + new_size += START_STRSIZE - 1; + new_size &= -START_STRSIZE; + text = realloc(text, new_size); + text_asize = new_size; + } + memcpy(text + text_size, str, size); + text_size += size; + text[text_size] = 0; +} + +static void alloc_string(const char *str, int size) +{ + text = xmalloc(size + 1); + memcpy(text, str, size); + text[size] = 0; +} + +#define INITIAL 0 +#define COMMAND 1 +#define HELP 2 +#define STRING 3 +#define PARAM 4 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +static int yy_init_globals (void ); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int zconflex_destroy (void ); + +int zconfget_debug (void ); + +void zconfset_debug (int debug_flag ); + +YY_EXTRA_TYPE zconfget_extra (void ); + +void zconfset_extra (YY_EXTRA_TYPE user_defined ); + +FILE *zconfget_in (void ); + +void zconfset_in (FILE * in_str ); + +FILE *zconfget_out (void ); + +void zconfset_out (FILE * out_str ); + +int zconfget_leng (void ); + +char *zconfget_text (void ); + +int zconfget_lineno (void ); + +void zconfset_lineno (int line_number ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int zconfwrap (void ); +#else +extern int zconfwrap (void ); +#endif +#endif + + static void yyunput (int c,char *buf_ptr ); + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ); +#endif + +#ifndef YY_NO_INPUT + +#ifdef __cplusplus +static int yyinput (void ); +#else +static int input (void ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#define YY_READ_BUF_SIZE 8192 +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO fwrite( zconftext, zconfleng, 1, zconfout ) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result".
+ */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + errno=0; \ + while ( (result = read( fileno(zconfin), (char *) buf, max_size )) < 0 ) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(zconfin); \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg ) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int zconflex (void); + +#define YY_DECL int zconflex (void) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after zconftext and zconfleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + + int str = 0; + int ts, i; + + if ( !(yy_init) ) + { + (yy_init) = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! (yy_start) ) + (yy_start) = 1; /* first start state */ + + if ( ! zconfin ) + zconfin = stdin; + + if ( ! zconfout ) + zconfout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + zconfensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + zconf_create_buffer(zconfin,YY_BUF_SIZE ); + } + + zconf_load_buffer_state( ); + } + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = (yy_c_buf_p); + + /* Support of zconftext. */ + *yy_cp = (yy_hold_char); + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = (yy_start); +yy_match: + while ( (yy_current_state = yy_nxt[yy_current_state][ yy_ec[YY_SC_TO_UI(*yy_cp)] ]) > 0 ) + ++yy_cp; + + yy_current_state = -yy_current_state; + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. 
*/ + + switch ( yy_act ) + { /* beginning of action switch */ +case 1: +/* rule 1 can match eol */ +case 2: +/* rule 2 can match eol */ +YY_RULE_SETUP +{ + current_file->lineno++; + return T_EOL; +} + YY_BREAK +case 3: +YY_RULE_SETUP + + YY_BREAK +case 4: +YY_RULE_SETUP +{ + BEGIN(COMMAND); +} + YY_BREAK +case 5: +YY_RULE_SETUP +{ + unput(zconftext[0]); + BEGIN(COMMAND); +} + YY_BREAK + +case 6: +YY_RULE_SETUP +{ + const struct kconf_id *id = kconf_id_lookup(zconftext, zconfleng); + BEGIN(PARAM); + current_pos.file = current_file; + current_pos.lineno = current_file->lineno; + if (id && id->flags & TF_COMMAND) { + zconflval.id = id; + return id->token; + } + alloc_string(zconftext, zconfleng); + zconflval.string = text; + return T_WORD; + } + YY_BREAK +case 7: +YY_RULE_SETUP + + YY_BREAK +case 8: +/* rule 8 can match eol */ +YY_RULE_SETUP +{ + BEGIN(INITIAL); + current_file->lineno++; + return T_EOL; + } + YY_BREAK + +case 9: +YY_RULE_SETUP +return T_AND; + YY_BREAK +case 10: +YY_RULE_SETUP +return T_OR; + YY_BREAK +case 11: +YY_RULE_SETUP +return T_OPEN_PAREN; + YY_BREAK +case 12: +YY_RULE_SETUP +return T_CLOSE_PAREN; + YY_BREAK +case 13: +YY_RULE_SETUP +return T_NOT; + YY_BREAK +case 14: +YY_RULE_SETUP +return T_EQUAL; + YY_BREAK +case 15: +YY_RULE_SETUP +return T_UNEQUAL; + YY_BREAK +case 16: +YY_RULE_SETUP +return T_LESS_EQUAL; + YY_BREAK +case 17: +YY_RULE_SETUP +return T_GREATER_EQUAL; + YY_BREAK +case 18: +YY_RULE_SETUP +return T_LESS; + YY_BREAK +case 19: +YY_RULE_SETUP +return T_GREATER; + YY_BREAK +case 20: +YY_RULE_SETUP +{ + str = zconftext[0]; + new_string(); + BEGIN(STRING); + } + YY_BREAK +case 21: +/* rule 21 can match eol */ +YY_RULE_SETUP +BEGIN(INITIAL); current_file->lineno++; return T_EOL; + YY_BREAK +case 22: +YY_RULE_SETUP +/* ignore */ + YY_BREAK +case 23: +YY_RULE_SETUP +{ + const struct kconf_id *id = kconf_id_lookup(zconftext, zconfleng); + if (id && id->flags & TF_PARAM) { + zconflval.id = id; + return id->token; + } + alloc_string(zconftext, zconfleng); + zconflval.string = text; + return T_WORD; + } + YY_BREAK +case 24: +YY_RULE_SETUP +/* comment */ + YY_BREAK +case 25: +/* rule 25 can match eol */ +YY_RULE_SETUP +current_file->lineno++; + YY_BREAK +case 26: +YY_RULE_SETUP + + YY_BREAK +case 27: +YY_RULE_SETUP +{ + fprintf(stderr, + "%s:%d:warning: ignoring unsupported character '%c'\n", + zconf_curname(), zconf_lineno(), *zconftext); + } + YY_BREAK +case YY_STATE_EOF(PARAM): +{ + BEGIN(INITIAL); + } + YY_BREAK + +case 28: +/* rule 28 can match eol */ +*yy_cp = (yy_hold_char); /* undo effects of setting up zconftext */ +(yy_c_buf_p) = yy_cp -= 1; +YY_DO_BEFORE_ACTION; /* set up zconftext again */ +YY_RULE_SETUP +{ + append_string(zconftext, zconfleng); + zconflval.string = text; + return T_WORD_QUOTE; + } + YY_BREAK +case 29: +YY_RULE_SETUP +{ + append_string(zconftext, zconfleng); + } + YY_BREAK +case 30: +/* rule 30 can match eol */ +*yy_cp = (yy_hold_char); /* undo effects of setting up zconftext */ +(yy_c_buf_p) = yy_cp -= 1; +YY_DO_BEFORE_ACTION; /* set up zconftext again */ +YY_RULE_SETUP +{ + append_string(zconftext + 1, zconfleng - 1); + zconflval.string = text; + return T_WORD_QUOTE; + } + YY_BREAK +case 31: +YY_RULE_SETUP +{ + append_string(zconftext + 1, zconfleng - 1); + } + YY_BREAK +case 32: +YY_RULE_SETUP +{ + if (str == zconftext[0]) { + BEGIN(PARAM); + zconflval.string = text; + return T_WORD_QUOTE; + } else + append_string(zconftext, 1); + } + YY_BREAK +case 33: +/* rule 33 can match eol */ +YY_RULE_SETUP +{ + printf("%s:%d:warning: 
multi-line strings not supported\n", zconf_curname(), zconf_lineno()); + current_file->lineno++; + BEGIN(INITIAL); + return T_EOL; + } + YY_BREAK +case YY_STATE_EOF(STRING): +{ + BEGIN(INITIAL); + } + YY_BREAK + +case 34: +YY_RULE_SETUP +{ + ts = 0; + for (i = 0; i < zconfleng; i++) { + if (zconftext[i] == '\t') + ts = (ts & ~7) + 8; + else + ts++; + } + last_ts = ts; + if (first_ts) { + if (ts < first_ts) { + zconf_endhelp(); + return T_HELPTEXT; + } + ts -= first_ts; + while (ts > 8) { + append_string(" ", 8); + ts -= 8; + } + append_string(" ", ts); + } + } + YY_BREAK +case 35: +/* rule 35 can match eol */ +*yy_cp = (yy_hold_char); /* undo effects of setting up zconftext */ +(yy_c_buf_p) = yy_cp -= 1; +YY_DO_BEFORE_ACTION; /* set up zconftext again */ +YY_RULE_SETUP +{ + current_file->lineno++; + zconf_endhelp(); + return T_HELPTEXT; + } + YY_BREAK +case 36: +/* rule 36 can match eol */ +YY_RULE_SETUP +{ + current_file->lineno++; + append_string("\n", 1); + } + YY_BREAK +case 37: +YY_RULE_SETUP +{ + while (zconfleng) { + if ((zconftext[zconfleng-1] != ' ') && (zconftext[zconfleng-1] != '\t')) + break; + zconfleng--; + } + append_string(zconftext, zconfleng); + if (!first_ts) + first_ts = last_ts; + } + YY_BREAK +case YY_STATE_EOF(HELP): +{ + zconf_endhelp(); + return T_HELPTEXT; + } + YY_BREAK + +case YY_STATE_EOF(INITIAL): +case YY_STATE_EOF(COMMAND): +{ + if (current_file) { + zconf_endfile(); + return T_EOL; + } + fclose(zconfin); + yyterminate(); +} + YY_BREAK +case 38: +YY_RULE_SETUP +YY_FATAL_ERROR( "flex scanner jammed" ); + YY_BREAK + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = (yy_hold_char); + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed zconfin at a new source and called + * zconflex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = zconfin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state ); + + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. 
*/ + yy_cp = ++(yy_c_buf_p); + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = (yy_c_buf_p); + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_END_OF_FILE: + { + (yy_did_buffer_switch_on_eof) = 0; + + if ( zconfwrap( ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * zconftext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = + (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + (yy_c_buf_p) = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ +} /* end of zconflex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (void) +{ + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = (yytext_ptr); + register int number_to_move, i; + int ret_val; + + if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER; + + int yy_c_buf_p_offset = + (int) ((yy_c_buf_p) - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. 
*/ + zconfrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. */ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + (yy_n_chars), (size_t) num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( (yy_n_chars) == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + zconfrestart(zconfin ); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yy_size_t) ((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + yy_size_t new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) zconfrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + } + + (yy_n_chars) += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; + + (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (void) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + + yy_current_state = (yy_start); + + for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) + { + yy_current_state = yy_nxt[yy_current_state][(*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1)]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +{ + register int yy_is_jam; + + yy_current_state = yy_nxt[yy_current_state][1]; + yy_is_jam = (yy_current_state <= 0); + + return yy_is_jam ? 0 : yy_current_state; +} + + static void yyunput (int c, register char * yy_bp ) +{ + register char *yy_cp; + + yy_cp = (yy_c_buf_p); + + /* undo effects of setting up zconftext */ + *yy_cp = (yy_hold_char); + + if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) + { /* need to shift things up to make room */ + /* +2 for EOB chars. 
*/ + register int number_to_move = (yy_n_chars) + 2; + register char *dest = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size + 2]; + register char *source = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]; + + while ( source > YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + *--dest = *--source; + + yy_cp += (int) (dest - source); + yy_bp += (int) (dest - source); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_buf_size; + + if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) + YY_FATAL_ERROR( "flex scanner push-back overflow" ); + } + + *--yy_cp = (char) c; + + (yytext_ptr) = yy_bp; + (yy_hold_char) = *yy_cp; + (yy_c_buf_p) = yy_cp; +} + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (void) +#else + static int input (void) +#endif + +{ + int c; + + *(yy_c_buf_p) = (yy_hold_char); + + if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + /* This was really a NUL. */ + *(yy_c_buf_p) = '\0'; + + else + { /* need more input */ + int offset = (yy_c_buf_p) - (yytext_ptr); + ++(yy_c_buf_p); + + switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + zconfrestart(zconfin ); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( zconfwrap( ) ) + return EOF; + + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(); +#else + return input(); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = (yytext_ptr) + offset; + break; + } + } + } + + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ + *(yy_c_buf_p) = '\0'; /* preserve zconftext */ + (yy_hold_char) = *++(yy_c_buf_p); + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * + * @note This function does not reset the start condition to @c INITIAL . + */ + void zconfrestart (FILE * input_file ) +{ + + if ( ! YY_CURRENT_BUFFER ){ + zconfensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + zconf_create_buffer(zconfin,YY_BUF_SIZE ); + } + + zconf_init_buffer(YY_CURRENT_BUFFER,input_file ); + zconf_load_buffer_state( ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * + */ + void zconf_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +{ + + /* TODO. We should be able to replace this entire function body + * with + * zconfpop_buffer_state(); + * zconfpush_buffer_state(new_buffer); + */ + zconfensure_buffer_stack (); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. 
*/ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + zconf_load_buffer_state( ); + + /* We don't actually know whether we did this switch during + * EOF (zconfwrap()) processing, but the only time this flag + * is looked at is after zconfwrap() is called, so it's safe + * to go ahead and always set it. + */ + (yy_did_buffer_switch_on_eof) = 1; +} + +static void zconf_load_buffer_state (void) +{ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + zconfin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + (yy_hold_char) = *(yy_c_buf_p); +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * + * @return the allocated buffer state. + */ + YY_BUFFER_STATE zconf_create_buffer (FILE * file, int size ) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) zconfalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in zconf_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) zconfalloc(b->yy_buf_size + 2 ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in zconf_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + zconf_init_buffer(b,file ); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with zconf_create_buffer() + * + */ + void zconf_delete_buffer (YY_BUFFER_STATE b ) +{ + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + zconffree((void *) b->yy_ch_buf ); + + zconffree((void *) b ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a zconfrestart() or at EOF. + */ + static void zconf_init_buffer (YY_BUFFER_STATE b, FILE * file ) + +{ + int oerrno = errno; + + zconf_flush_buffer(b ); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then zconf_init_buffer was _probably_ + * called from zconfrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * + */ + void zconf_flush_buffer (YY_BUFFER_STATE b ) +{ + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + zconf_load_buffer_state( ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. 
+ * + */ +void zconfpush_buffer_state (YY_BUFFER_STATE new_buffer ) +{ + if (new_buffer == NULL) + return; + + zconfensure_buffer_stack(); + + /* This block is copied from zconf_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + (yy_buffer_stack_top)++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from zconf_switch_to_buffer. */ + zconf_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * + */ +void zconfpop_buffer_state (void) +{ + if (!YY_CURRENT_BUFFER) + return; + + zconf_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + if ((yy_buffer_stack_top) > 0) + --(yy_buffer_stack_top); + + if (YY_CURRENT_BUFFER) { + zconf_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void zconfensure_buffer_stack (void) +{ + int num_to_alloc; + + if (!(yy_buffer_stack)) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; + (yy_buffer_stack) = (struct yy_buffer_state**)zconfalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in zconfensure_buffer_stack()" ); + + memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + (yy_buffer_stack_max) = num_to_alloc; + (yy_buffer_stack_top) = 0; + return; + } + + if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + int grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = (yy_buffer_stack_max) + grow_size; + (yy_buffer_stack) = (struct yy_buffer_state**)zconfrealloc + ((yy_buffer_stack), + num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in zconfensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); + (yy_buffer_stack_max) = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE zconf_scan_buffer (char * base, yy_size_t size ) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return 0; + + b = (YY_BUFFER_STATE) zconfalloc(sizeof( struct yy_buffer_state ) ); + if ( ! 
b ) + YY_FATAL_ERROR( "out of dynamic memory in zconf_scan_buffer()" ); + + b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = 0; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + zconf_switch_to_buffer(b ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to zconflex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * zconf_scan_bytes() instead. + */ +YY_BUFFER_STATE zconf_scan_string (yyconst char * yystr ) +{ + + return zconf_scan_bytes(yystr,strlen(yystr) ); +} + +/** Setup the input buffer state to scan the given bytes. The next call to zconflex() will + * scan from a @e copy of @a bytes. + * @param bytes the byte buffer to scan + * @param len the number of bytes in the buffer pointed to by @a bytes. + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE zconf_scan_bytes (yyconst char * yybytes, int _yybytes_len ) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = _yybytes_len + 2; + buf = (char *) zconfalloc(n ); + if ( ! buf ) + YY_FATAL_ERROR( "out of dynamic memory in zconf_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = zconf_scan_buffer(buf,n ); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in zconf_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yy_fatal_error (yyconst char* msg ) +{ + (void) fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up zconftext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + zconftext[zconfleng] = (yy_hold_char); \ + (yy_c_buf_p) = zconftext + yyless_macro_arg; \ + (yy_hold_char) = *(yy_c_buf_p); \ + *(yy_c_buf_p) = '\0'; \ + zconfleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the current line number. + * + */ +int zconfget_lineno (void) +{ + + return zconflineno; +} + +/** Get the input stream. + * + */ +FILE *zconfget_in (void) +{ + return zconfin; +} + +/** Get the output stream. + * + */ +FILE *zconfget_out (void) +{ + return zconfout; +} + +/** Get the length of the current token. + * + */ +int zconfget_leng (void) +{ + return zconfleng; +} + +/** Get the current token. + * + */ + +char *zconfget_text (void) +{ + return zconftext; +} + +/** Set the current line number. + * @param line_number + * + */ +void zconfset_lineno (int line_number ) +{ + + zconflineno = line_number; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param in_str A readable stream. 
+ * + * @see zconf_switch_to_buffer + */ +void zconfset_in (FILE * in_str ) +{ + zconfin = in_str ; +} + +void zconfset_out (FILE * out_str ) +{ + zconfout = out_str ; +} + +int zconfget_debug (void) +{ + return zconf_flex_debug; +} + +void zconfset_debug (int bdebug ) +{ + zconf_flex_debug = bdebug ; +} + +static int yy_init_globals (void) +{ + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from zconflex_destroy(), so don't allocate here. + */ + + (yy_buffer_stack) = 0; + (yy_buffer_stack_top) = 0; + (yy_buffer_stack_max) = 0; + (yy_c_buf_p) = (char *) 0; + (yy_init) = 0; + (yy_start) = 0; + +/* Defined in main.c */ +#ifdef YY_STDINIT + zconfin = stdin; + zconfout = stdout; +#else + zconfin = (FILE *) 0; + zconfout = (FILE *) 0; +#endif + + /* For future reference: Set errno on error, since we are called by + * zconflex_init() + */ + return 0; +} + +/* zconflex_destroy is for both reentrant and non-reentrant scanners. */ +int zconflex_destroy (void) +{ + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + zconf_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + zconfpop_buffer_state(); + } + + /* Destroy the stack itself. */ + zconffree((yy_buffer_stack) ); + (yy_buffer_stack) = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * zconflex() is called, initialization will occur. */ + yy_init_globals( ); + + return 0; +} + +/* + * Internal utility routines. + */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, yyconst char * s2, int n ) +{ + register int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * s ) +{ + register int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *zconfalloc (yy_size_t size ) +{ + return (void *) malloc( size ); +} + +void *zconfrealloc (void * ptr, yy_size_t size ) +{ + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return (void *) realloc( (char *) ptr, size ); +} + +void zconffree (void * ptr ) +{ + free( (char *) ptr ); /* see zconfrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +void zconf_starthelp(void) +{ + new_string(); + last_ts = first_ts = 0; + BEGIN(HELP); +} + +static void zconf_endhelp(void) +{ + zconflval.string = text; + BEGIN(INITIAL); +} + +/* + * Try to open specified file with following names: + * ./name + * $(srctree)/name + * The latter is used when srctree is separate from objtree + * when compiling the kernel. + * Return NULL if file is not found. 
+ */ +FILE *zconf_fopen(const char *name) +{ + char *env, fullname[PATH_MAX+1]; + FILE *f; + + f = fopen(name, "r"); + if (!f && name != NULL && name[0] != '/') { + env = getenv(SRCTREE); + if (env) { + sprintf(fullname, "%s/%s", env, name); + f = fopen(fullname, "r"); + } + } + return f; +} + +void zconf_initscan(const char *name) +{ + zconfin = zconf_fopen(name); + if (!zconfin) { + printf("can't find file %s\n", name); + exit(1); + } + + current_buf = xmalloc(sizeof(*current_buf)); + memset(current_buf, 0, sizeof(*current_buf)); + + current_file = file_lookup(name); + current_file->lineno = 1; +} + +void zconf_nextfile(const char *name) +{ + struct file *iter; + struct file *file = file_lookup(name); + struct buffer *buf = xmalloc(sizeof(*buf)); + memset(buf, 0, sizeof(*buf)); + + current_buf->state = YY_CURRENT_BUFFER; + zconfin = zconf_fopen(file->name); + if (!zconfin) { + printf("%s:%d: can't open file \"%s\"\n", + zconf_curname(), zconf_lineno(), file->name); + exit(1); + } + zconf_switch_to_buffer(zconf_create_buffer(zconfin,YY_BUF_SIZE)); + buf->parent = current_buf; + current_buf = buf; + + for (iter = current_file->parent; iter; iter = iter->parent ) { + if (!strcmp(current_file->name,iter->name) ) { + printf("%s:%d: recursive inclusion detected. " + "Inclusion path:\n current file : '%s'\n", + zconf_curname(), zconf_lineno(), + zconf_curname()); + iter = current_file->parent; + while (iter && \ + strcmp(iter->name,current_file->name)) { + printf(" included from: '%s:%d'\n", + iter->name, iter->lineno-1); + iter = iter->parent; + } + if (iter) + printf(" included from: '%s:%d'\n", + iter->name, iter->lineno+1); + exit(1); + } + } + file->lineno = 1; + file->parent = current_file; + current_file = file; +} + +static void zconf_endfile(void) +{ + struct buffer *parent; + + current_file = current_file->parent; + + parent = current_buf->parent; + if (parent) { + fclose(zconfin); + zconf_delete_buffer(YY_CURRENT_BUFFER); + zconf_switch_to_buffer(parent->state); + } + free(current_buf); + current_buf = parent; +} + +int zconf_lineno(void) +{ + return current_pos.lineno; +} + +const char *zconf_curname(void) +{ + return current_pos.file ? current_pos.file->name : ""; +} + diff -Nur linux-4.2/scripts/kconfig/zconf.tab.c linux-4.2-pck/scripts/kconfig/zconf.tab.c --- linux-4.2/scripts/kconfig/zconf.tab.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/kconfig/zconf.tab.c 2015-09-08 01:26:14.839136029 -0300 @@ -0,0 +1,2580 @@ +/* A Bison parser, made by GNU Bison 2.5.1. */ + +/* Bison implementation for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2012 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
*/ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* C LALR(1) parser skeleton written by Richard Stallman, by + simplifying the original so-called "semantic" parser. */ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output. */ +#define YYBISON 1 + +/* Bison version. */ +#define YYBISON_VERSION "2.5.1" + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 0 + +/* Push parsers. */ +#define YYPUSH 0 + +/* Pull parsers. */ +#define YYPULL 1 + +/* Using locations. */ +#define YYLSP_NEEDED 0 + +/* Substitute the variable and function names. */ +#define yyparse zconfparse +#define yylex zconflex +#define yyerror zconferror +#define yylval zconflval +#define yychar zconfchar +#define yydebug zconfdebug +#define yynerrs zconfnerrs + + +/* Copy the first part of user declarations. */ + + +/* + * Copyright (C) 2002 Roman Zippel + * Released under the terms of the GNU GPL v2.0. + */ + +#include +#include +#include +#include +#include +#include + +#include "lkc.h" + +#define printd(mask, fmt...) if (cdebug & (mask)) printf(fmt) + +#define PRINTD 0x0001 +#define DEBUG_PARSE 0x0002 + +int cdebug = PRINTD; + +extern int zconflex(void); +static void zconfprint(const char *err, ...); +static void zconf_error(const char *err, ...); +static void zconferror(const char *err); +static bool zconf_endtoken(const struct kconf_id *id, int starttoken, int endtoken); + +struct symbol *symbol_hash[SYMBOL_HASHSIZE]; + +static struct menu *current_menu, *current_entry; + + + + +# ifndef YY_NULL +# if defined __cplusplus && 201103L <= __cplusplus +# define YY_NULL nullptr +# else +# define YY_NULL 0 +# endif +# endif + +/* Enabling traces. */ +#ifndef YYDEBUG +# define YYDEBUG 1 +#endif + +/* Enabling verbose error messages. */ +#ifdef YYERROR_VERBOSE +# undef YYERROR_VERBOSE +# define YYERROR_VERBOSE 1 +#else +# define YYERROR_VERBOSE 0 +#endif + +/* Enabling the token table. */ +#ifndef YYTOKEN_TABLE +# define YYTOKEN_TABLE 0 +#endif + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. 
*/ + enum yytokentype { + T_MAINMENU = 258, + T_MENU = 259, + T_ENDMENU = 260, + T_SOURCE = 261, + T_CHOICE = 262, + T_ENDCHOICE = 263, + T_COMMENT = 264, + T_CONFIG = 265, + T_MENUCONFIG = 266, + T_HELP = 267, + T_HELPTEXT = 268, + T_IF = 269, + T_ENDIF = 270, + T_DEPENDS = 271, + T_OPTIONAL = 272, + T_PROMPT = 273, + T_TYPE = 274, + T_DEFAULT = 275, + T_SELECT = 276, + T_RANGE = 277, + T_VISIBLE = 278, + T_OPTION = 279, + T_ON = 280, + T_WORD = 281, + T_WORD_QUOTE = 282, + T_UNEQUAL = 283, + T_LESS = 284, + T_LESS_EQUAL = 285, + T_GREATER = 286, + T_GREATER_EQUAL = 287, + T_CLOSE_PAREN = 288, + T_OPEN_PAREN = 289, + T_EOL = 290, + T_OR = 291, + T_AND = 292, + T_EQUAL = 293, + T_NOT = 294 + }; +#endif + + + +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +typedef union YYSTYPE +{ + + + char *string; + struct file *file; + struct symbol *symbol; + struct expr *expr; + struct menu *menu; + const struct kconf_id *id; + + + +} YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +#endif + + +/* Copy the second part of user declarations. */ + + +/* Include zconf.hash.c here so it can see the token constants. */ +#include "zconf.hash.c" + + + +#ifdef short +# undef short +#endif + +#ifdef YYTYPE_UINT8 +typedef YYTYPE_UINT8 yytype_uint8; +#else +typedef unsigned char yytype_uint8; +#endif + +#ifdef YYTYPE_INT8 +typedef YYTYPE_INT8 yytype_int8; +#elif (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +typedef signed char yytype_int8; +#else +typedef short int yytype_int8; +#endif + +#ifdef YYTYPE_UINT16 +typedef YYTYPE_UINT16 yytype_uint16; +#else +typedef unsigned short int yytype_uint16; +#endif + +#ifdef YYTYPE_INT16 +typedef YYTYPE_INT16 yytype_int16; +#else +typedef short int yytype_int16; +#endif + +#ifndef YYSIZE_T +# ifdef __SIZE_TYPE__ +# define YYSIZE_T __SIZE_TYPE__ +# elif defined size_t +# define YYSIZE_T size_t +# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned int +# endif +#endif + +#define YYSIZE_MAXIMUM ((YYSIZE_T) -1) + +#ifndef YY_ +# if defined YYENABLE_NLS && YYENABLE_NLS +# if ENABLE_NLS +# include /* INFRINGES ON USER NAME SPACE */ +# define YY_(msgid) dgettext ("bison-runtime", msgid) +# endif +# endif +# ifndef YY_ +# define YY_(msgid) msgid +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YYUSE(e) ((void) (e)) +#else +# define YYUSE(e) /* empty */ +#endif + +/* Identity function, used to suppress warnings about constant conditions. */ +#ifndef lint +# define YYID(n) (n) +#else +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static int +YYID (int yyi) +#else +static int +YYID (yyi) + int yyi; +#endif +{ + return yyi; +} +#endif + +#if ! defined yyoverflow || YYERROR_VERBOSE + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include /* INFRINGES ON USER NAME SPACE */ +# elif defined _AIX +# define YYSTACK_ALLOC __alloca +# elif defined _MSC_VER +# include /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! 
defined _ALLOCA_H && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include /* INFRINGES ON USER NAME SPACE */ + /* Use EXIT_SUCCESS as a witness for stdlib.h. */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's `empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0)) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined EXIT_SUCCESS \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* ! defined yyoverflow || YYERROR_VERBOSE */ + + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + yytype_int16 yyss_alloc; + YYSTYPE yyvs_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +# define YYCOPY_NEEDED 1 + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYSIZE_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / sizeof (*yyptr); \ + } \ + while (YYID (0)) + +#endif + +#if defined YYCOPY_NEEDED && YYCOPY_NEEDED +/* Copy COUNT objects from SRC to DST. The source and destination do + not overlap. 
*/ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(Dst, Src, Count) \ + __builtin_memcpy (Dst, Src, (Count) * sizeof (*(Src))) +# else +# define YYCOPY(Dst, Src, Count) \ + do \ + { \ + YYSIZE_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (Dst)[yyi] = (Src)[yyi]; \ + } \ + while (YYID (0)) +# endif +# endif +#endif /* !YYCOPY_NEEDED */ + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 11 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 298 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 40 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 50 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 122 +/* YYNRULES -- Number of states. */ +#define YYNSTATES 199 + +/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */ +#define YYUNDEFTOK 2 +#define YYMAXUTOK 294 + +#define YYTRANSLATE(YYX) \ + ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK) + +/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. */ +static const yytype_uint8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39 +}; + +#if YYDEBUG +/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in + YYRHS. */ +static const yytype_uint16 yyprhs[] = +{ + 0, 0, 3, 6, 8, 11, 13, 14, 17, 20, + 23, 26, 31, 36, 40, 42, 44, 46, 48, 50, + 52, 54, 56, 58, 60, 62, 64, 66, 68, 72, + 75, 79, 82, 86, 89, 90, 93, 96, 99, 102, + 105, 108, 112, 117, 122, 127, 133, 137, 138, 142, + 143, 146, 150, 153, 155, 159, 160, 163, 166, 169, + 172, 175, 180, 184, 187, 192, 193, 196, 200, 202, + 206, 207, 210, 213, 216, 220, 224, 228, 230, 234, + 235, 238, 241, 244, 248, 252, 255, 258, 261, 262, + 265, 268, 271, 276, 277, 280, 283, 286, 287, 290, + 292, 294, 297, 300, 303, 305, 308, 309, 312, 314, + 318, 322, 326, 330, 334, 338, 342, 345, 349, 353, + 355, 357, 358 +}; + +/* YYRHS -- A `-1'-separated list of the rules' RHS. 
*/ +static const yytype_int8 yyrhs[] = +{ + 41, 0, -1, 85, 42, -1, 42, -1, 67, 43, + -1, 43, -1, -1, 43, 45, -1, 43, 59, -1, + 43, 71, -1, 43, 84, -1, 43, 26, 1, 35, + -1, 43, 44, 1, 35, -1, 43, 1, 35, -1, + 16, -1, 18, -1, 19, -1, 21, -1, 17, -1, + 22, -1, 20, -1, 23, -1, 35, -1, 65, -1, + 75, -1, 48, -1, 50, -1, 73, -1, 26, 1, + 35, -1, 1, 35, -1, 10, 26, 35, -1, 47, + 51, -1, 11, 26, 35, -1, 49, 51, -1, -1, + 51, 52, -1, 51, 53, -1, 51, 79, -1, 51, + 77, -1, 51, 46, -1, 51, 35, -1, 19, 82, + 35, -1, 18, 83, 86, 35, -1, 20, 87, 86, + 35, -1, 21, 26, 86, 35, -1, 22, 88, 88, + 86, 35, -1, 24, 54, 35, -1, -1, 54, 26, + 55, -1, -1, 38, 83, -1, 7, 89, 35, -1, + 56, 60, -1, 84, -1, 57, 62, 58, -1, -1, + 60, 61, -1, 60, 79, -1, 60, 77, -1, 60, + 35, -1, 60, 46, -1, 18, 83, 86, 35, -1, + 19, 82, 35, -1, 17, 35, -1, 20, 26, 86, + 35, -1, -1, 62, 45, -1, 14, 87, 85, -1, + 84, -1, 63, 66, 64, -1, -1, 66, 45, -1, + 66, 71, -1, 66, 59, -1, 3, 83, 85, -1, + 4, 83, 35, -1, 68, 80, 78, -1, 84, -1, + 69, 72, 70, -1, -1, 72, 45, -1, 72, 71, + -1, 72, 59, -1, 6, 83, 35, -1, 9, 83, + 35, -1, 74, 78, -1, 12, 35, -1, 76, 13, + -1, -1, 78, 79, -1, 78, 35, -1, 78, 46, + -1, 16, 25, 87, 35, -1, -1, 80, 81, -1, + 80, 35, -1, 23, 86, -1, -1, 83, 86, -1, + 26, -1, 27, -1, 5, 35, -1, 8, 35, -1, + 15, 35, -1, 35, -1, 85, 35, -1, -1, 14, + 87, -1, 88, -1, 88, 29, 88, -1, 88, 30, + 88, -1, 88, 31, 88, -1, 88, 32, 88, -1, + 88, 38, 88, -1, 88, 28, 88, -1, 34, 87, + 33, -1, 39, 87, -1, 87, 36, 87, -1, 87, + 37, 87, -1, 26, -1, 27, -1, -1, 26, -1 +}; + +/* YYRLINE[YYN] -- source line where rule number YYN was defined. */ +static const yytype_uint16 yyrline[] = +{ + 0, 108, 108, 108, 110, 110, 112, 114, 115, 116, + 117, 118, 119, 123, 127, 127, 127, 127, 127, 127, + 127, 127, 131, 132, 133, 134, 135, 136, 140, 141, + 147, 155, 161, 169, 179, 181, 182, 183, 184, 185, + 186, 189, 197, 203, 213, 219, 225, 228, 230, 241, + 242, 247, 256, 261, 269, 272, 274, 275, 276, 277, + 278, 281, 287, 298, 304, 314, 316, 321, 329, 337, + 340, 342, 343, 344, 349, 356, 363, 368, 376, 379, + 381, 382, 383, 386, 394, 401, 408, 414, 421, 423, + 424, 425, 428, 436, 438, 439, 442, 449, 451, 456, + 457, 460, 461, 462, 466, 467, 470, 471, 474, 475, + 476, 477, 478, 479, 480, 481, 482, 483, 484, 487, + 488, 491, 492 +}; +#endif + +#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. 
*/ +static const char *const yytname[] = +{ + "$end", "error", "$undefined", "T_MAINMENU", "T_MENU", "T_ENDMENU", + "T_SOURCE", "T_CHOICE", "T_ENDCHOICE", "T_COMMENT", "T_CONFIG", + "T_MENUCONFIG", "T_HELP", "T_HELPTEXT", "T_IF", "T_ENDIF", "T_DEPENDS", + "T_OPTIONAL", "T_PROMPT", "T_TYPE", "T_DEFAULT", "T_SELECT", "T_RANGE", + "T_VISIBLE", "T_OPTION", "T_ON", "T_WORD", "T_WORD_QUOTE", "T_UNEQUAL", + "T_LESS", "T_LESS_EQUAL", "T_GREATER", "T_GREATER_EQUAL", + "T_CLOSE_PAREN", "T_OPEN_PAREN", "T_EOL", "T_OR", "T_AND", "T_EQUAL", + "T_NOT", "$accept", "input", "start", "stmt_list", "option_name", + "common_stmt", "option_error", "config_entry_start", "config_stmt", + "menuconfig_entry_start", "menuconfig_stmt", "config_option_list", + "config_option", "symbol_option", "symbol_option_list", + "symbol_option_arg", "choice", "choice_entry", "choice_end", + "choice_stmt", "choice_option_list", "choice_option", "choice_block", + "if_entry", "if_end", "if_stmt", "if_block", "mainmenu_stmt", "menu", + "menu_entry", "menu_end", "menu_stmt", "menu_block", "source_stmt", + "comment", "comment_stmt", "help_start", "help", "depends_list", + "depends", "visibility_list", "visible", "prompt_stmt_opt", "prompt", + "end", "nl", "if_expr", "expr", "symbol", "word_opt", YY_NULL +}; +#endif + +# ifdef YYPRINT +/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to + token YYLEX-NUM. */ +static const yytype_uint16 yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 262, 263, 264, + 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294 +}; +# endif + +/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const yytype_uint8 yyr1[] = +{ + 0, 40, 41, 41, 42, 42, 43, 43, 43, 43, + 43, 43, 43, 43, 44, 44, 44, 44, 44, 44, + 44, 44, 45, 45, 45, 45, 45, 45, 46, 46, + 47, 48, 49, 50, 51, 51, 51, 51, 51, 51, + 51, 52, 52, 52, 52, 52, 53, 54, 54, 55, + 55, 56, 57, 58, 59, 60, 60, 60, 60, 60, + 60, 61, 61, 61, 61, 62, 62, 63, 64, 65, + 66, 66, 66, 66, 67, 68, 69, 70, 71, 72, + 72, 72, 72, 73, 74, 75, 76, 77, 78, 78, + 78, 78, 79, 80, 80, 80, 81, 82, 82, 83, + 83, 84, 84, 84, 85, 85, 86, 86, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, 87, 88, + 88, 89, 89 +}; + +/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */ +static const yytype_uint8 yyr2[] = +{ + 0, 2, 2, 1, 2, 1, 0, 2, 2, 2, + 2, 4, 4, 3, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, + 3, 2, 3, 2, 0, 2, 2, 2, 2, 2, + 2, 3, 4, 4, 4, 5, 3, 0, 3, 0, + 2, 3, 2, 1, 3, 0, 2, 2, 2, 2, + 2, 4, 3, 2, 4, 0, 2, 3, 1, 3, + 0, 2, 2, 2, 3, 3, 3, 1, 3, 0, + 2, 2, 2, 3, 3, 2, 2, 2, 0, 2, + 2, 2, 4, 0, 2, 2, 2, 0, 2, 1, + 1, 2, 2, 2, 1, 2, 0, 2, 1, 3, + 3, 3, 3, 3, 3, 3, 2, 3, 3, 1, + 1, 0, 1 +}; + +/* YYDEFACT[STATE-NAME] -- Default reduction number in state STATE-NUM. + Performed when YYTABLE doesn't specify something else to do. Zero + means the default is an error. 
*/ +static const yytype_uint8 yydefact[] = +{ + 6, 0, 104, 0, 3, 0, 6, 6, 99, 100, + 0, 1, 0, 0, 0, 0, 121, 0, 0, 0, + 0, 0, 0, 14, 18, 15, 16, 20, 17, 19, + 21, 0, 22, 0, 7, 34, 25, 34, 26, 55, + 65, 8, 70, 23, 93, 79, 9, 27, 88, 24, + 10, 0, 105, 2, 74, 13, 0, 101, 0, 122, + 0, 102, 0, 0, 0, 119, 120, 0, 0, 0, + 108, 103, 0, 0, 0, 0, 0, 0, 0, 88, + 0, 0, 75, 83, 51, 84, 30, 32, 0, 116, + 0, 0, 67, 0, 0, 0, 0, 0, 0, 11, + 12, 0, 0, 0, 0, 97, 0, 0, 0, 47, + 0, 40, 39, 35, 36, 0, 38, 37, 0, 0, + 97, 0, 59, 60, 56, 58, 57, 66, 54, 53, + 71, 73, 69, 72, 68, 106, 95, 0, 94, 80, + 82, 78, 81, 77, 90, 91, 89, 115, 117, 118, + 114, 109, 110, 111, 112, 113, 29, 86, 0, 106, + 0, 106, 106, 106, 0, 0, 0, 87, 63, 106, + 0, 106, 0, 96, 0, 0, 41, 98, 0, 0, + 106, 49, 46, 28, 0, 62, 0, 107, 92, 42, + 43, 44, 0, 0, 48, 61, 64, 45, 50 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int16 yydefgoto[] = +{ + -1, 3, 4, 5, 33, 34, 112, 35, 36, 37, + 38, 74, 113, 114, 165, 194, 39, 40, 128, 41, + 76, 124, 77, 42, 132, 43, 78, 6, 44, 45, + 141, 46, 80, 47, 48, 49, 115, 116, 81, 117, + 79, 138, 160, 161, 50, 7, 173, 69, 70, 60 +}; + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +#define YYPACT_NINF -91 +static const yytype_int16 yypact[] = +{ + 19, 37, -91, 13, -91, 79, -91, 20, -91, -91, + -16, -91, 21, 37, 25, 37, 41, 36, 37, 78, + 83, 31, 56, -91, -91, -91, -91, -91, -91, -91, + -91, 116, -91, 127, -91, -91, -91, -91, -91, -91, + -91, -91, -91, -91, -91, -91, -91, -91, -91, -91, + -91, 147, -91, -91, 105, -91, 109, -91, 111, -91, + 114, -91, 136, 137, 142, -91, -91, 31, 31, 76, + 254, -91, 143, 146, 27, 115, 207, 258, 243, -14, + 243, 179, -91, -91, -91, -91, -91, -91, -7, -91, + 31, 31, 105, 51, 51, 51, 51, 51, 51, -91, + -91, 156, 168, 181, 37, 37, 31, 178, 51, -91, + 206, -91, -91, -91, -91, 196, -91, -91, 175, 37, + 37, 185, -91, -91, -91, -91, -91, -91, -91, -91, + -91, -91, -91, -91, -91, 214, -91, 230, -91, -91, + -91, -91, -91, -91, -91, -91, -91, -91, 183, -91, + -91, -91, -91, -91, -91, -91, -91, -91, 31, 214, + 194, 214, 45, 214, 51, 26, 195, -91, -91, 214, + 197, 214, 31, -91, 139, 208, -91, -91, 220, 224, + 214, 222, -91, -91, 226, -91, 227, 123, -91, -91, + -91, -91, 235, 37, -91, -91, -91, -91, -91 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const yytype_int16 yypgoto[] = +{ + -91, -91, 264, 268, -91, 30, -65, -91, -91, -91, + -91, 238, -91, -91, -91, -91, -91, -91, -91, -12, + -91, -91, -91, -91, -91, -91, -91, -91, -91, -91, + -91, -5, -91, -91, -91, -91, -91, 200, 209, -61, + -91, -91, 170, -1, 65, 0, 118, -66, -90, -91 +}; + +/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule which + number is the opposite. If YYTABLE_NINF, syntax error. 
*/ +#define YYTABLE_NINF -86 +static const yytype_int16 yytable[] = +{ + 10, 88, 89, 150, 151, 152, 153, 154, 155, 135, + 54, 123, 56, 11, 58, 126, 145, 62, 164, 2, + 146, 136, 1, 1, 148, 149, 147, -31, 101, 90, + 91, -31, -31, -31, -31, -31, -31, -31, -31, 102, + 162, -31, -31, 103, -31, 104, 105, 106, 107, 108, + -31, 109, 181, 110, 2, 52, 55, 65, 66, 172, + 57, 182, 111, 8, 9, 67, 131, 59, 140, 92, + 68, 61, 145, 133, 180, 142, 146, 65, 66, -5, + 12, 90, 91, 13, 14, 15, 16, 17, 18, 19, + 20, 71, 174, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 159, 63, 31, 187, 127, 130, 64, + 139, 2, 90, 91, 32, -33, 101, 72, 169, -33, + -33, -33, -33, -33, -33, -33, -33, 102, 73, -33, + -33, 103, -33, 104, 105, 106, 107, 108, -33, 109, + 52, 110, 129, 134, 82, 143, 83, -4, 12, 84, + 111, 13, 14, 15, 16, 17, 18, 19, 20, 90, + 91, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 85, 86, 31, 188, 90, 91, 87, 99, -85, + 101, 100, 32, -85, -85, -85, -85, -85, -85, -85, + -85, 156, 198, -85, -85, 103, -85, -85, -85, -85, + -85, -85, -85, 157, 163, 110, 158, 166, 101, 167, + 168, 171, -52, -52, 144, -52, -52, -52, -52, 102, + 91, -52, -52, 103, 118, 119, 120, 121, 172, 176, + 183, 101, 185, 110, -76, -76, -76, -76, -76, -76, + -76, -76, 122, 189, -76, -76, 103, 13, 14, 15, + 16, 17, 18, 19, 20, 190, 110, 21, 22, 191, + 193, 195, 196, 14, 15, 144, 17, 18, 19, 20, + 197, 53, 21, 22, 51, 75, 125, 175, 32, 177, + 178, 179, 93, 94, 95, 96, 97, 184, 137, 186, + 170, 0, 98, 32, 0, 0, 0, 0, 192 +}; + +#define yypact_value_is_default(yystate) \ + ((yystate) == (-91)) + +#define yytable_value_is_error(yytable_value) \ + YYID (0) + +static const yytype_int16 yycheck[] = +{ + 1, 67, 68, 93, 94, 95, 96, 97, 98, 23, + 10, 76, 13, 0, 15, 76, 81, 18, 108, 35, + 81, 35, 3, 3, 90, 91, 33, 0, 1, 36, + 37, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 106, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 26, 26, 35, 35, 35, 26, 27, 14, + 35, 35, 35, 26, 27, 34, 78, 26, 80, 69, + 39, 35, 137, 78, 164, 80, 137, 26, 27, 0, + 1, 36, 37, 4, 5, 6, 7, 8, 9, 10, + 11, 35, 158, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 104, 26, 26, 172, 77, 78, 26, + 80, 35, 36, 37, 35, 0, 1, 1, 119, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 1, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 35, 26, 77, 78, 35, 80, 35, 0, 1, 35, + 35, 4, 5, 6, 7, 8, 9, 10, 11, 36, + 37, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 35, 35, 26, 35, 36, 37, 35, 35, 0, + 1, 35, 35, 4, 5, 6, 7, 8, 9, 10, + 11, 35, 193, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 35, 26, 26, 25, 1, 1, 13, + 35, 26, 5, 6, 35, 8, 9, 10, 11, 12, + 37, 14, 15, 16, 17, 18, 19, 20, 14, 35, + 35, 1, 35, 26, 4, 5, 6, 7, 8, 9, + 10, 11, 35, 35, 14, 15, 16, 4, 5, 6, + 7, 8, 9, 10, 11, 35, 26, 14, 15, 35, + 38, 35, 35, 5, 6, 35, 8, 9, 10, 11, + 35, 7, 14, 15, 6, 37, 76, 159, 35, 161, + 162, 163, 28, 29, 30, 31, 32, 169, 79, 171, + 120, -1, 38, 35, -1, -1, -1, -1, 180 +}; + +/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. 
*/ +static const yytype_uint8 yystos[] = +{ + 0, 3, 35, 41, 42, 43, 67, 85, 26, 27, + 83, 0, 1, 4, 5, 6, 7, 8, 9, 10, + 11, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 26, 35, 44, 45, 47, 48, 49, 50, 56, + 57, 59, 63, 65, 68, 69, 71, 73, 74, 75, + 84, 43, 35, 42, 85, 35, 83, 35, 83, 26, + 89, 35, 83, 26, 26, 26, 27, 34, 39, 87, + 88, 35, 1, 1, 51, 51, 60, 62, 66, 80, + 72, 78, 35, 35, 35, 35, 35, 35, 87, 87, + 36, 37, 85, 28, 29, 30, 31, 32, 38, 35, + 35, 1, 12, 16, 18, 19, 20, 21, 22, 24, + 26, 35, 46, 52, 53, 76, 77, 79, 17, 18, + 19, 20, 35, 46, 61, 77, 79, 45, 58, 84, + 45, 59, 64, 71, 84, 23, 35, 78, 81, 45, + 59, 70, 71, 84, 35, 46, 79, 33, 87, 87, + 88, 88, 88, 88, 88, 88, 35, 35, 25, 83, + 82, 83, 87, 26, 88, 54, 1, 13, 35, 83, + 82, 26, 14, 86, 87, 86, 35, 86, 86, 86, + 88, 26, 35, 35, 86, 35, 86, 87, 35, 35, + 35, 35, 86, 38, 55, 35, 35, 35, 83 +}; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) +#define YYEMPTY (-2) +#define YYEOF 0 + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +/* Like YYERROR except do call yyerror. This remains here temporarily + to ease the transition to the new meaning of YYERROR, for GCC. + Once GCC version 2 has supplanted version 1, this can go. However, + YYFAIL appears to be in use. Nevertheless, it is formally deprecated + in Bison 2.4.2's NEWS entry, where a plan to phase it out is + discussed. */ + +#define YYFAIL goto yyerrlab +#if defined YYFAIL + /* This is here to suppress warnings from the GCC cpp's + -Wunused-macros. Normally we don't worry about that warning, but + some users do, and we want to make it easy for users to remove + YYFAIL uses, which will produce warnings from Bison 2.5. */ +#endif + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + YYPOPSTACK (yylen); \ + yystate = *yyssp; \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ +while (YYID (0)) + + +#define YYTERROR 1 +#define YYERRCODE 256 + + +/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N]. + If N is 0, then set CURRENT to the empty location which ends + the previous symbol: RHS[0] (always defined). */ + +#define YYRHSLOC(Rhs, K) ((Rhs)[K]) +#ifndef YYLLOC_DEFAULT +# define YYLLOC_DEFAULT(Current, Rhs, N) \ + do \ + if (YYID (N)) \ + { \ + (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \ + (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \ + (Current).last_line = YYRHSLOC (Rhs, N).last_line; \ + (Current).last_column = YYRHSLOC (Rhs, N).last_column; \ + } \ + else \ + { \ + (Current).first_line = (Current).last_line = \ + YYRHSLOC (Rhs, 0).last_line; \ + (Current).first_column = (Current).last_column = \ + YYRHSLOC (Rhs, 0).last_column; \ + } \ + while (YYID (0)) +#endif + + +/* This macro is provided for backward compatibility. */ + +#ifndef YY_LOCATION_PRINT +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +#endif + + +/* YYLEX -- calling `yylex' with the right arguments. */ + +#ifdef YYLEX_PARAM +# define YYLEX yylex (YYLEX_PARAM) +#else +# define YYLEX yylex () +#endif + +/* Enable debugging if requested. 
*/ +#if YYDEBUG + +# ifndef YYFPRINTF +# include /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (YYID (0)) + +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Type, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (YYID (0)) + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_value_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + FILE *yyo = yyoutput; + YYUSE (yyo); + if (!yyvaluep) + return; +# ifdef YYPRINT + if (yytype < YYNTOKENS) + YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); +# else + YYUSE (yyoutput); +# endif + switch (yytype) + { + default: + break; + } +} + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + if (yytype < YYNTOKENS) + YYFPRINTF (yyoutput, "token %s (", yytname[yytype]); + else + YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]); + + yy_symbol_value_print (yyoutput, yytype, yyvaluep); + YYFPRINTF (yyoutput, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop) +#else +static void +yy_stack_print (yybottom, yytop) + yytype_int16 *yybottom; + yytype_int16 *yytop; +#endif +{ + YYFPRINTF (stderr, "Stack now"); + for (; yybottom <= yytop; yybottom++) + { + int yybot = *yybottom; + YYFPRINTF (stderr, " %d", yybot); + } + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (YYID (0)) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_reduce_print (YYSTYPE *yyvsp, int yyrule) +#else +static void +yy_reduce_print (yyvsp, yyrule) + YYSTYPE *yyvsp; + int yyrule; +#endif +{ + int yynrhs = yyr2[yyrule]; + int yyi; + unsigned long int yylno = yyrline[yyrule]; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n", + yyrule - 1, yylno); + /* The symbols being reduced. 
*/ + for (yyi = 0; yyi < yynrhs; yyi++) + { + YYFPRINTF (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + &(yyvsp[(yyi + 1) - (yynrhs)]) + ); + YYFPRINTF (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyvsp, Rule); \ +} while (YYID (0)) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. */ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + +#if YYERROR_VERBOSE + +# ifndef yystrlen +# if defined __GLIBC__ && defined _STRING_H +# define yystrlen strlen +# else +/* Return the length of YYSTR. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static YYSIZE_T +yystrlen (const char *yystr) +#else +static YYSIZE_T +yystrlen (yystr) + const char *yystr; +#endif +{ + YYSIZE_T yylen; + for (yylen = 0; yystr[yylen]; yylen++) + continue; + return yylen; +} +# endif +# endif + +# ifndef yystpcpy +# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE +# define yystpcpy stpcpy +# else +/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in + YYDEST. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static char * +yystpcpy (char *yydest, const char *yysrc) +#else +static char * +yystpcpy (yydest, yysrc) + char *yydest; + const char *yysrc; +#endif +{ + char *yyd = yydest; + const char *yys = yysrc; + + while ((*yyd++ = *yys++) != '\0') + continue; + + return yyd - 1; +} +# endif +# endif + +# ifndef yytnamerr +/* Copy to YYRES the contents of YYSTR after stripping away unnecessary + quotes and backslashes, so that it's suitable for yyerror. The + heuristic is that double-quoting is unnecessary unless the string + contains an apostrophe, a comma, or backslash (other than + backslash-backslash). YYSTR is taken from yytname. If YYRES is + null, do not copy; instead, return the length of what the result + would have been. */ +static YYSIZE_T +yytnamerr (char *yyres, const char *yystr) +{ + if (*yystr == '"') + { + YYSIZE_T yyn = 0; + char const *yyp = yystr; + + for (;;) + switch (*++yyp) + { + case '\'': + case ',': + goto do_not_strip_quotes; + + case '\\': + if (*++yyp != '\\') + goto do_not_strip_quotes; + /* Fall through. */ + default: + if (yyres) + yyres[yyn] = *yyp; + yyn++; + break; + + case '"': + if (yyres) + yyres[yyn] = '\0'; + return yyn; + } + do_not_strip_quotes: ; + } + + if (! yyres) + return yystrlen (yystr); + + return yystpcpy (yyres, yystr) - yyres; +} +# endif + +/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message + about the unexpected token YYTOKEN for the state stack whose top is + YYSSP. + + Return 0 if *YYMSG was successfully written. Return 1 if *YYMSG is + not large enough to hold the message. 
In that case, also set + *YYMSG_ALLOC to the required number of bytes. Return 2 if the + required number of bytes is too large to store. */ +static int +yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg, + yytype_int16 *yyssp, int yytoken) +{ + YYSIZE_T yysize0 = yytnamerr (YY_NULL, yytname[yytoken]); + YYSIZE_T yysize = yysize0; + YYSIZE_T yysize1; + enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 }; + /* Internationalized format string. */ + const char *yyformat = YY_NULL; + /* Arguments of yyformat. */ + char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM]; + /* Number of reported tokens (one for the "unexpected", one per + "expected"). */ + int yycount = 0; + + /* There are many possibilities here to consider: + - Assume YYFAIL is not used. It's too flawed to consider. See + + for details. YYERROR is fine as it does not invoke this + function. + - If this state is a consistent state with a default action, then + the only way this function was invoked is if the default action + is an error action. In that case, don't check for expected + tokens because there are none. + - The only way there can be no lookahead present (in yychar) is if + this state is a consistent state with a default action. Thus, + detecting the absence of a lookahead is sufficient to determine + that there is no unexpected or expected token to report. In that + case, just report a simple "syntax error". + - Don't assume there isn't a lookahead just because this state is a + consistent state with a default action. There might have been a + previous inconsistent state, consistent state with a non-default + action, or user semantic action that manipulated yychar. + - Of course, the expected token list depends on states to have + correct lookahead information, and it depends on the parser not + to perform extra reductions after fetching a lookahead from the + scanner and before detecting a syntax error. Thus, state merging + (from LALR or IELR) and default reductions corrupt the expected + token list. However, the list is correct for canonical LR with + one exception: it will still contain any token that will not be + accepted due to an error action in a later state. + */ + if (yytoken != YYEMPTY) + { + int yyn = yypact[*yyssp]; + yyarg[yycount++] = yytname[yytoken]; + if (!yypact_value_is_default (yyn)) + { + /* Start YYX at -YYN if negative to avoid negative indexes in + YYCHECK. In other words, skip the first -YYN actions for + this state because they are default actions. */ + int yyxbegin = yyn < 0 ? -yyn : 0; + /* Stay within bounds of both yycheck and yytname. */ + int yychecklim = YYLAST - yyn + 1; + int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS; + int yyx; + + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR + && !yytable_value_is_error (yytable[yyx + yyn])) + { + if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM) + { + yycount = 1; + yysize = yysize0; + break; + } + yyarg[yycount++] = yytname[yyx]; + yysize1 = yysize + yytnamerr (YY_NULL, yytname[yyx]); + if (! 
(yysize <= yysize1 + && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) + return 2; + yysize = yysize1; + } + } + } + + switch (yycount) + { +# define YYCASE_(N, S) \ + case N: \ + yyformat = S; \ + break + YYCASE_(0, YY_("syntax error")); + YYCASE_(1, YY_("syntax error, unexpected %s")); + YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s")); + YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s")); + YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s")); + YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s")); +# undef YYCASE_ + } + + yysize1 = yysize + yystrlen (yyformat); + if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) + return 2; + yysize = yysize1; + + if (*yymsg_alloc < yysize) + { + *yymsg_alloc = 2 * yysize; + if (! (yysize <= *yymsg_alloc + && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM)) + *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM; + return 1; + } + + /* Avoid sprintf, as that infringes on the user's name space. + Don't have undefined behavior even if the translation + produced a string with the wrong number of "%s"s. */ + { + char *yyp = *yymsg; + int yyi = 0; + while ((*yyp = *yyformat) != '\0') + if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount) + { + yyp += yytnamerr (yyp, yyarg[yyi++]); + yyformat += 2; + } + else + { + yyp++; + yyformat++; + } + } + return 0; +} +#endif /* YYERROR_VERBOSE */ + +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep) +#else +static void +yydestruct (yymsg, yytype, yyvaluep) + const char *yymsg; + int yytype; + YYSTYPE *yyvaluep; +#endif +{ + YYUSE (yyvaluep); + + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp); + + switch (yytype) + { + case 57: /* "choice_entry" */ + + { + fprintf(stderr, "%s:%d: missing end statement for this entry\n", + (yyvaluep->menu)->file->name, (yyvaluep->menu)->lineno); + if (current_menu == (yyvaluep->menu)) + menu_end_menu(); +}; + + break; + case 63: /* "if_entry" */ + + { + fprintf(stderr, "%s:%d: missing end statement for this entry\n", + (yyvaluep->menu)->file->name, (yyvaluep->menu)->lineno); + if (current_menu == (yyvaluep->menu)) + menu_end_menu(); +}; + + break; + case 69: /* "menu_entry" */ + + { + fprintf(stderr, "%s:%d: missing end statement for this entry\n", + (yyvaluep->menu)->file->name, (yyvaluep->menu)->lineno); + if (current_menu == (yyvaluep->menu)) + menu_end_menu(); +}; + + break; + + default: + break; + } +} + + +/* Prevent warnings from -Wmissing-prototypes. */ +#ifdef YYPARSE_PARAM +#if defined __STDC__ || defined __cplusplus +int yyparse (void *YYPARSE_PARAM); +#else +int yyparse (); +#endif +#else /* ! YYPARSE_PARAM */ +#if defined __STDC__ || defined __cplusplus +int yyparse (void); +#else +int yyparse (); +#endif +#endif /* ! YYPARSE_PARAM */ + + +/* The lookahead symbol. */ +int yychar; + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval; + +/* Number of syntax errors so far. */ +int yynerrs; + + +/*----------. +| yyparse. | +`----------*/ + +#ifdef YYPARSE_PARAM +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void *YYPARSE_PARAM) +#else +int +yyparse (YYPARSE_PARAM) + void *YYPARSE_PARAM; +#endif +#else /* ! 
YYPARSE_PARAM */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void) +#else +int +yyparse () + +#endif +#endif +{ + int yystate; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus; + + /* The stacks and their tools: + `yyss': related to states. + `yyvs': related to semantic values. + + Refer to the stacks through separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* The state stack. */ + yytype_int16 yyssa[YYINITDEPTH]; + yytype_int16 *yyss; + yytype_int16 *yyssp; + + /* The semantic value stack. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs; + YYSTYPE *yyvsp; + + YYSIZE_T yystacksize; + + int yyn; + int yyresult; + /* Lookahead token as an internal (translated) token number. */ + int yytoken; + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + +#if YYERROR_VERBOSE + /* Buffer for error messages, and its allocated size. */ + char yymsgbuf[128]; + char *yymsg = yymsgbuf; + YYSIZE_T yymsg_alloc = sizeof yymsgbuf; +#endif + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N)) + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + yytoken = 0; + yyss = yyssa; + yyvs = yyvsa; + yystacksize = YYINITDEPTH; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yystate = 0; + yyerrstatus = 0; + yynerrs = 0; + yychar = YYEMPTY; /* Cause a token to be read. */ + + /* Initialize stack pointers. + Waste one element of value and location stack + so that they stay on the same level as the state stack. + The wasted elements are never initialized. */ + yyssp = yyss; + yyvsp = yyvs; + + goto yysetstate; + +/*------------------------------------------------------------. +| yynewstate -- Push a new state, which is found in yystate. | +`------------------------------------------------------------*/ + yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + yysetstate: + *yyssp = yystate; + + if (yyss + yystacksize - 1 <= yyssp) + { + /* Get the current used size of the three stacks, in elements. */ + YYSIZE_T yysize = yyssp - yyss + 1; + +#ifdef yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + YYSTYPE *yyvs1 = yyvs; + yytype_int16 *yyss1 = yyss; + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * sizeof (*yyssp), + &yyvs1, yysize * sizeof (*yyvsp), + &yystacksize); + + yyss = yyss1; + yyvs = yyvs1; + } +#else /* no yyoverflow */ +# ifndef YYSTACK_RELOCATE + goto yyexhaustedlab; +# else + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyexhaustedlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yytype_int16 *yyss1 = yyss; + union yyalloc *yyptr = + (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize)); + if (! 
yyptr) + goto yyexhaustedlab; + YYSTACK_RELOCATE (yyss_alloc, yyss); + YYSTACK_RELOCATE (yyvs_alloc, yyvs); +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif +#endif /* no yyoverflow */ + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + YYDPRINTF ((stderr, "Stack size increased to %lu\n", + (unsigned long int) yystacksize)); + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } + + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + + if (yystate == YYFINAL) + YYACCEPT; + + goto yybackup; + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + + /* Do appropriate processing given the current state. Read a + lookahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to lookahead token. */ + yyn = yypact[yystate]; + if (yypact_value_is_default (yyn)) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. */ + + /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token: ")); + yychar = YYLEX; + } + + if (yychar <= YYEOF) + { + yychar = yytoken = YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yytable_value_is_error (yyn)) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the lookahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + + /* Discard the shifted token. */ + yychar = YYEMPTY; + + yystate = yyn; + *++yyvsp = yylval; + + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- Do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + `$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. 
*/ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 10: + + { zconf_error("unexpected end statement"); } + break; + + case 11: + + { zconf_error("unknown statement \"%s\"", (yyvsp[(2) - (4)].string)); } + break; + + case 12: + + { + zconf_error("unexpected option \"%s\"", kconf_id_strings + (yyvsp[(2) - (4)].id)->name); +} + break; + + case 13: + + { zconf_error("invalid statement"); } + break; + + case 28: + + { zconf_error("unknown option \"%s\"", (yyvsp[(1) - (3)].string)); } + break; + + case 29: + + { zconf_error("invalid option"); } + break; + + case 30: + + { + struct symbol *sym = sym_lookup((yyvsp[(2) - (3)].string), 0); + sym->flags |= SYMBOL_OPTIONAL; + menu_add_entry(sym); + printd(DEBUG_PARSE, "%s:%d:config %s\n", zconf_curname(), zconf_lineno(), (yyvsp[(2) - (3)].string)); +} + break; + + case 31: + + { + menu_end_entry(); + printd(DEBUG_PARSE, "%s:%d:endconfig\n", zconf_curname(), zconf_lineno()); +} + break; + + case 32: + + { + struct symbol *sym = sym_lookup((yyvsp[(2) - (3)].string), 0); + sym->flags |= SYMBOL_OPTIONAL; + menu_add_entry(sym); + printd(DEBUG_PARSE, "%s:%d:menuconfig %s\n", zconf_curname(), zconf_lineno(), (yyvsp[(2) - (3)].string)); +} + break; + + case 33: + + { + if (current_entry->prompt) + current_entry->prompt->type = P_MENU; + else + zconfprint("warning: menuconfig statement without prompt"); + menu_end_entry(); + printd(DEBUG_PARSE, "%s:%d:endconfig\n", zconf_curname(), zconf_lineno()); +} + break; + + case 41: + + { + menu_set_type((yyvsp[(1) - (3)].id)->stype); + printd(DEBUG_PARSE, "%s:%d:type(%u)\n", + zconf_curname(), zconf_lineno(), + (yyvsp[(1) - (3)].id)->stype); +} + break; + + case 42: + + { + menu_add_prompt(P_PROMPT, (yyvsp[(2) - (4)].string), (yyvsp[(3) - (4)].expr)); + printd(DEBUG_PARSE, "%s:%d:prompt\n", zconf_curname(), zconf_lineno()); +} + break; + + case 43: + + { + menu_add_expr(P_DEFAULT, (yyvsp[(2) - (4)].expr), (yyvsp[(3) - (4)].expr)); + if ((yyvsp[(1) - (4)].id)->stype != S_UNKNOWN) + menu_set_type((yyvsp[(1) - (4)].id)->stype); + printd(DEBUG_PARSE, "%s:%d:default(%u)\n", + zconf_curname(), zconf_lineno(), + (yyvsp[(1) - (4)].id)->stype); +} + break; + + case 44: + + { + menu_add_symbol(P_SELECT, sym_lookup((yyvsp[(2) - (4)].string), 0), (yyvsp[(3) - (4)].expr)); + printd(DEBUG_PARSE, "%s:%d:select\n", zconf_curname(), zconf_lineno()); +} + break; + + case 45: + + { + menu_add_expr(P_RANGE, expr_alloc_comp(E_RANGE,(yyvsp[(2) - (5)].symbol), (yyvsp[(3) - (5)].symbol)), (yyvsp[(4) - (5)].expr)); + printd(DEBUG_PARSE, "%s:%d:range\n", zconf_curname(), zconf_lineno()); +} + break; + + case 48: + + { + const struct kconf_id *id = kconf_id_lookup((yyvsp[(2) - (3)].string), strlen((yyvsp[(2) - (3)].string))); + if (id && id->flags & TF_OPTION) + menu_add_option(id->token, (yyvsp[(3) - (3)].string)); + else + zconfprint("warning: ignoring unknown option %s", (yyvsp[(2) - (3)].string)); + free((yyvsp[(2) - (3)].string)); +} + break; + + case 49: + + { (yyval.string) = NULL; } + break; + + case 50: + + { (yyval.string) = (yyvsp[(2) - (2)].string); } + break; + + case 51: + + { + struct symbol *sym = sym_lookup((yyvsp[(2) - (3)].string), SYMBOL_CHOICE); + sym->flags |= SYMBOL_AUTO; + menu_add_entry(sym); + menu_add_expr(P_CHOICE, NULL, NULL); + printd(DEBUG_PARSE, "%s:%d:choice\n", zconf_curname(), zconf_lineno()); +} + break; + + case 52: + + { + (yyval.menu) = menu_add_menu(); +} + break; + + case 53: + + { + if (zconf_endtoken((yyvsp[(1) - (1)].id), T_CHOICE, T_ENDCHOICE)) { + menu_end_menu(); + 
printd(DEBUG_PARSE, "%s:%d:endchoice\n", zconf_curname(), zconf_lineno()); + } +} + break; + + case 61: + + { + menu_add_prompt(P_PROMPT, (yyvsp[(2) - (4)].string), (yyvsp[(3) - (4)].expr)); + printd(DEBUG_PARSE, "%s:%d:prompt\n", zconf_curname(), zconf_lineno()); +} + break; + + case 62: + + { + if ((yyvsp[(1) - (3)].id)->stype == S_BOOLEAN || (yyvsp[(1) - (3)].id)->stype == S_TRISTATE) { + menu_set_type((yyvsp[(1) - (3)].id)->stype); + printd(DEBUG_PARSE, "%s:%d:type(%u)\n", + zconf_curname(), zconf_lineno(), + (yyvsp[(1) - (3)].id)->stype); + } else + YYERROR; +} + break; + + case 63: + + { + current_entry->sym->flags |= SYMBOL_OPTIONAL; + printd(DEBUG_PARSE, "%s:%d:optional\n", zconf_curname(), zconf_lineno()); +} + break; + + case 64: + + { + if ((yyvsp[(1) - (4)].id)->stype == S_UNKNOWN) { + menu_add_symbol(P_DEFAULT, sym_lookup((yyvsp[(2) - (4)].string), 0), (yyvsp[(3) - (4)].expr)); + printd(DEBUG_PARSE, "%s:%d:default\n", + zconf_curname(), zconf_lineno()); + } else + YYERROR; +} + break; + + case 67: + + { + printd(DEBUG_PARSE, "%s:%d:if\n", zconf_curname(), zconf_lineno()); + menu_add_entry(NULL); + menu_add_dep((yyvsp[(2) - (3)].expr)); + (yyval.menu) = menu_add_menu(); +} + break; + + case 68: + + { + if (zconf_endtoken((yyvsp[(1) - (1)].id), T_IF, T_ENDIF)) { + menu_end_menu(); + printd(DEBUG_PARSE, "%s:%d:endif\n", zconf_curname(), zconf_lineno()); + } +} + break; + + case 74: + + { + menu_add_prompt(P_MENU, (yyvsp[(2) - (3)].string), NULL); +} + break; + + case 75: + + { + menu_add_entry(NULL); + menu_add_prompt(P_MENU, (yyvsp[(2) - (3)].string), NULL); + printd(DEBUG_PARSE, "%s:%d:menu\n", zconf_curname(), zconf_lineno()); +} + break; + + case 76: + + { + (yyval.menu) = menu_add_menu(); +} + break; + + case 77: + + { + if (zconf_endtoken((yyvsp[(1) - (1)].id), T_MENU, T_ENDMENU)) { + menu_end_menu(); + printd(DEBUG_PARSE, "%s:%d:endmenu\n", zconf_curname(), zconf_lineno()); + } +} + break; + + case 83: + + { + printd(DEBUG_PARSE, "%s:%d:source %s\n", zconf_curname(), zconf_lineno(), (yyvsp[(2) - (3)].string)); + zconf_nextfile((yyvsp[(2) - (3)].string)); +} + break; + + case 84: + + { + menu_add_entry(NULL); + menu_add_prompt(P_COMMENT, (yyvsp[(2) - (3)].string), NULL); + printd(DEBUG_PARSE, "%s:%d:comment\n", zconf_curname(), zconf_lineno()); +} + break; + + case 85: + + { + menu_end_entry(); +} + break; + + case 86: + + { + printd(DEBUG_PARSE, "%s:%d:help\n", zconf_curname(), zconf_lineno()); + zconf_starthelp(); +} + break; + + case 87: + + { + current_entry->help = (yyvsp[(2) - (2)].string); +} + break; + + case 92: + + { + menu_add_dep((yyvsp[(3) - (4)].expr)); + printd(DEBUG_PARSE, "%s:%d:depends on\n", zconf_curname(), zconf_lineno()); +} + break; + + case 96: + + { + menu_add_visibility((yyvsp[(2) - (2)].expr)); +} + break; + + case 98: + + { + menu_add_prompt(P_PROMPT, (yyvsp[(1) - (2)].string), (yyvsp[(2) - (2)].expr)); +} + break; + + case 101: + + { (yyval.id) = (yyvsp[(1) - (2)].id); } + break; + + case 102: + + { (yyval.id) = (yyvsp[(1) - (2)].id); } + break; + + case 103: + + { (yyval.id) = (yyvsp[(1) - (2)].id); } + break; + + case 106: + + { (yyval.expr) = NULL; } + break; + + case 107: + + { (yyval.expr) = (yyvsp[(2) - (2)].expr); } + break; + + case 108: + + { (yyval.expr) = expr_alloc_symbol((yyvsp[(1) - (1)].symbol)); } + break; + + case 109: + + { (yyval.expr) = expr_alloc_comp(E_LTH, (yyvsp[(1) - (3)].symbol), (yyvsp[(3) - (3)].symbol)); } + break; + + case 110: + + { (yyval.expr) = expr_alloc_comp(E_LEQ, (yyvsp[(1) - (3)].symbol), (yyvsp[(3) - 
(3)].symbol)); } + break; + + case 111: + + { (yyval.expr) = expr_alloc_comp(E_GTH, (yyvsp[(1) - (3)].symbol), (yyvsp[(3) - (3)].symbol)); } + break; + + case 112: + + { (yyval.expr) = expr_alloc_comp(E_GEQ, (yyvsp[(1) - (3)].symbol), (yyvsp[(3) - (3)].symbol)); } + break; + + case 113: + + { (yyval.expr) = expr_alloc_comp(E_EQUAL, (yyvsp[(1) - (3)].symbol), (yyvsp[(3) - (3)].symbol)); } + break; + + case 114: + + { (yyval.expr) = expr_alloc_comp(E_UNEQUAL, (yyvsp[(1) - (3)].symbol), (yyvsp[(3) - (3)].symbol)); } + break; + + case 115: + + { (yyval.expr) = (yyvsp[(2) - (3)].expr); } + break; + + case 116: + + { (yyval.expr) = expr_alloc_one(E_NOT, (yyvsp[(2) - (2)].expr)); } + break; + + case 117: + + { (yyval.expr) = expr_alloc_two(E_OR, (yyvsp[(1) - (3)].expr), (yyvsp[(3) - (3)].expr)); } + break; + + case 118: + + { (yyval.expr) = expr_alloc_two(E_AND, (yyvsp[(1) - (3)].expr), (yyvsp[(3) - (3)].expr)); } + break; + + case 119: + + { (yyval.symbol) = sym_lookup((yyvsp[(1) - (1)].string), 0); free((yyvsp[(1) - (1)].string)); } + break; + + case 120: + + { (yyval.symbol) = sym_lookup((yyvsp[(1) - (1)].string), SYMBOL_CONST); free((yyvsp[(1) - (1)].string)); } + break; + + case 121: + + { (yyval.string) = NULL; } + break; + + + + default: break; + } + /* User semantic actions sometimes alter yychar, and that requires + that yytoken be updated with the new translation. We take the + approach of translating immediately before every use of yytoken. + One alternative is translating here after every semantic action, + but that translation would be missed if the semantic action invokes + YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or + if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an + incorrect destructor might then be invoked immediately. In the + case of YYERROR or YYBACKUP, subsequent parser actions might lead + to an incorrect destructor call or verbose syntax error message + before the lookahead is translated. */ + YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc); + + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + + *++yyvsp = yyval; + + /* Now `shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + + yyn = yyr1[yyn]; + + yystate = yypgoto[yyn - YYNTOKENS] + *yyssp; + if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp) + yystate = yytable[yystate]; + else + yystate = yydefgoto[yyn - YYNTOKENS]; + + goto yynewstate; + + +/*------------------------------------. +| yyerrlab -- here on detecting error | +`------------------------------------*/ +yyerrlab: + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar); + + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; +#if ! 
YYERROR_VERBOSE + yyerror (YY_("syntax error")); +#else +# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \ + yyssp, yytoken) + { + char const *yymsgp = YY_("syntax error"); + int yysyntax_error_status; + yysyntax_error_status = YYSYNTAX_ERROR; + if (yysyntax_error_status == 0) + yymsgp = yymsg; + else if (yysyntax_error_status == 1) + { + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); + yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc); + if (!yymsg) + { + yymsg = yymsgbuf; + yymsg_alloc = sizeof yymsgbuf; + yysyntax_error_status = 2; + } + else + { + yysyntax_error_status = YYSYNTAX_ERROR; + yymsgp = yymsg; + } + } + yyerror (yymsgp); + if (yysyntax_error_status == 2) + goto yyexhaustedlab; + } +# undef YYSYNTAX_ERROR +#endif + } + + + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + YYABORT; + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + + /* Pacify compilers like GCC when the user code never invokes + YYERROR and the label yyerrorlab therefore never appears in user + code. */ + if (/*CONSTCOND*/ 0) + goto yyerrorlab; + + /* Do not reclaim the symbols of the rule which action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + for (;;) + { + yyn = yypact[yystate]; + if (!yypact_value_is_default (yyn)) + { + yyn += YYTERROR; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + YYABORT; + + + yydestruct ("Error: popping", + yystos[yystate], yyvsp); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + *++yyvsp = yylval; + + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. | +`-----------------------------------*/ +yyabortlab: + yyresult = 1; + goto yyreturn; + +#if !defined yyoverflow || YYERROR_VERBOSE +/*-------------------------------------------------. +| yyexhaustedlab -- memory exhaustion comes here. | +`-------------------------------------------------*/ +yyexhaustedlab: + yyerror (YY_("memory exhausted")); + yyresult = 2; + /* Fall through. */ +#endif + +yyreturn: + if (yychar != YYEMPTY) + { + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. 
*/ + yytoken = YYTRANSLATE (yychar); + yydestruct ("Cleanup: discarding lookahead", + yytoken, &yylval); + } + /* Do not reclaim the symbols of the rule which action triggered + this YYABORT or YYACCEPT. */ + YYPOPSTACK (yylen); + YY_STACK_PRINT (yyss, yyssp); + while (yyssp != yyss) + { + yydestruct ("Cleanup: popping", + yystos[*yyssp], yyvsp); + YYPOPSTACK (1); + } +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif +#if YYERROR_VERBOSE + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); +#endif + /* Make sure YYID is used. */ + return YYID (yyresult); +} + + + + + +void conf_parse(const char *name) +{ + struct symbol *sym; + int i; + + zconf_initscan(name); + + sym_init(); + _menu_init(); + rootmenu.prompt = menu_add_prompt(P_MENU, "Linux Kernel Configuration", NULL); + + if (getenv("ZCONF_DEBUG")) + zconfdebug = 1; + zconfparse(); + if (zconfnerrs) + exit(1); + if (!modules_sym) + modules_sym = sym_find( "n" ); + + rootmenu.prompt->text = _(rootmenu.prompt->text); + rootmenu.prompt->text = sym_expand_string_value(rootmenu.prompt->text); + + menu_finalize(&rootmenu); + for_all_symbols(i, sym) { + if (sym_check_deps(sym)) + zconfnerrs++; + } + if (zconfnerrs) + exit(1); + sym_set_change_count(1); +} + +static const char *zconf_tokenname(int token) +{ + switch (token) { + case T_MENU: return "menu"; + case T_ENDMENU: return "endmenu"; + case T_CHOICE: return "choice"; + case T_ENDCHOICE: return "endchoice"; + case T_IF: return "if"; + case T_ENDIF: return "endif"; + case T_DEPENDS: return "depends"; + case T_VISIBLE: return "visible"; + } + return ""; +} + +static bool zconf_endtoken(const struct kconf_id *id, int starttoken, int endtoken) +{ + if (id->token != endtoken) { + zconf_error("unexpected '%s' within %s block", + kconf_id_strings + id->name, zconf_tokenname(starttoken)); + zconfnerrs++; + return false; + } + if (current_menu->file != current_file) { + zconf_error("'%s' in different file than '%s'", + kconf_id_strings + id->name, zconf_tokenname(starttoken)); + fprintf(stderr, "%s:%d: location of the '%s'\n", + current_menu->file->name, current_menu->lineno, + zconf_tokenname(starttoken)); + zconfnerrs++; + return false; + } + return true; +} + +static void zconfprint(const char *err, ...) +{ + va_list ap; + + fprintf(stderr, "%s:%d: ", zconf_curname(), zconf_lineno()); + va_start(ap, err); + vfprintf(stderr, err, ap); + va_end(ap); + fprintf(stderr, "\n"); +} + +static void zconf_error(const char *err, ...) 
+{ + va_list ap; + + zconfnerrs++; + fprintf(stderr, "%s:%d: ", zconf_curname(), zconf_lineno()); + va_start(ap, err); + vfprintf(stderr, err, ap); + va_end(ap); + fprintf(stderr, "\n"); +} + +static void zconferror(const char *err) +{ + fprintf(stderr, "%s:%d: %s\n", zconf_curname(), zconf_lineno() + 1, err); +} + +static void print_quoted_string(FILE *out, const char *str) +{ + const char *p; + int len; + + putc('"', out); + while ((p = strchr(str, '"'))) { + len = p - str; + if (len) + fprintf(out, "%.*s", len, str); + fputs("\\\"", out); + str = p + 1; + } + fputs(str, out); + putc('"', out); +} + +static void print_symbol(FILE *out, struct menu *menu) +{ + struct symbol *sym = menu->sym; + struct property *prop; + + if (sym_is_choice(sym)) + fprintf(out, "\nchoice\n"); + else + fprintf(out, "\nconfig %s\n", sym->name); + switch (sym->type) { + case S_BOOLEAN: + fputs(" boolean\n", out); + break; + case S_TRISTATE: + fputs(" tristate\n", out); + break; + case S_STRING: + fputs(" string\n", out); + break; + case S_INT: + fputs(" integer\n", out); + break; + case S_HEX: + fputs(" hex\n", out); + break; + default: + fputs(" ???\n", out); + break; + } + for (prop = sym->prop; prop; prop = prop->next) { + if (prop->menu != menu) + continue; + switch (prop->type) { + case P_PROMPT: + fputs(" prompt ", out); + print_quoted_string(out, prop->text); + if (!expr_is_yes(prop->visible.expr)) { + fputs(" if ", out); + expr_fprint(prop->visible.expr, out); + } + fputc('\n', out); + break; + case P_DEFAULT: + fputs( " default ", out); + expr_fprint(prop->expr, out); + if (!expr_is_yes(prop->visible.expr)) { + fputs(" if ", out); + expr_fprint(prop->visible.expr, out); + } + fputc('\n', out); + break; + case P_CHOICE: + fputs(" #choice value\n", out); + break; + case P_SELECT: + fputs( " select ", out); + expr_fprint(prop->expr, out); + fputc('\n', out); + break; + case P_RANGE: + fputs( " range ", out); + expr_fprint(prop->expr, out); + fputc('\n', out); + break; + case P_MENU: + fputs( " menu ", out); + print_quoted_string(out, prop->text); + fputc('\n', out); + break; + default: + fprintf(out, " unknown prop %d!\n", prop->type); + break; + } + } + if (menu->help) { + int len = strlen(menu->help); + while (menu->help[--len] == '\n') + menu->help[len] = 0; + fprintf(out, " help\n%s\n", menu->help); + } +} + +void zconfdump(FILE *out) +{ + struct property *prop; + struct symbol *sym; + struct menu *menu; + + menu = rootmenu.list; + while (menu) { + if ((sym = menu->sym)) + print_symbol(out, menu); + else if ((prop = menu->prompt)) { + switch (prop->type) { + case P_COMMENT: + fputs("\ncomment ", out); + print_quoted_string(out, prop->text); + fputs("\n", out); + break; + case P_MENU: + fputs("\nmenu ", out); + print_quoted_string(out, prop->text); + fputs("\n", out); + break; + default: + ; + } + if (!expr_is_yes(prop->visible.expr)) { + fputs(" depends ", out); + expr_fprint(prop->visible.expr, out); + fputc('\n', out); + } + } + + if (menu->list) + menu = menu->list; + else if (menu->next) + menu = menu->next; + else while ((menu = menu->parent)) { + if (menu->prompt && menu->prompt->type == P_MENU) + fputs("\nendmenu\n", out); + if (menu->next) { + menu = menu->next; + break; + } + } + } +} + +#include "zconf.lex.c" +#include "util.c" +#include "confdata.c" +#include "expr.c" +#include "symbol.c" +#include "menu.c" + Binary files linux-4.2/scripts/kconfig/zconf.tab.o and linux-4.2-pck/scripts/kconfig/zconf.tab.o differ diff -Nur linux-4.2/scripts/mkcompile_h linux-4.2-pck/scripts/mkcompile_h --- 
linux-4.2/scripts/mkcompile_h 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/scripts/mkcompile_h 2015-09-08 00:48:22.245029757 -0300 @@ -54,8 +54,8 @@ fi UTS_VERSION="#$VERSION" -CONFIG_FLAGS="" -if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi +CONFIG_FLAGS="PCK" +if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" diff -Nur linux-4.2/scripts/tuxonice_output_to_csv.sh linux-4.2-pck/scripts/tuxonice_output_to_csv.sh --- linux-4.2/scripts/tuxonice_output_to_csv.sh 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/scripts/tuxonice_output_to_csv.sh 2015-09-08 11:20:32.583672945 -0300 @@ -0,0 +1,35 @@ +#!/bin/bash + +cat $1 | grep "\*TOI\*" | cut -b 22- | sed "s/ /,/g" | sed "s/\.//" | sort -n > $1.tmp +COLUMNS=$(cat $1.tmp | awk -F ',' ' { print $2 } ' | sort | uniq) +echo -n "pfn," > $1.tmp2 +for NAME in $COLUMNS; do + echo -n "$NAME," >> $1.tmp2 +done +echo >> $1.tmp2 +FIRST=1 +declare -A data +while IFS=, read -r pfn column value; do + if [ $FIRST -eq 1 ]; then + FIRST=0 + LAST_PFN=$pfn + fi + if [ $pfn -ne $LAST_PFN ]; then + echo -n "$LAST_PFN," >> $1.tmp2; + for NAME in $COLUMNS; do + echo -n "${data[$NAME]}," >> $1.tmp2 + done + data=( ) + echo >> $1.tmp2 + LAST_PFN=$pfn + fi + if [ -z "$value" ]; then + data[$column]=X + else + data[$column]=$value + fi +done < $1.tmp +mv $1.tmp2 $1.csv +rm $1.tmp +LIBREOFFICE=$(which libreoffice) +[ -n "$LIBREOFFICE" ] && libreoffice $1.csv & diff -Nur linux-4.2/security/keys/big_key.c linux-4.2-pck/security/keys/big_key.c --- linux-4.2/security/keys/big_key.c 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/security/keys/big_key.c 2015-09-08 11:20:32.627004164 -0300 @@ -68,7 +68,7 @@ * * TODO: Encrypt the stored data with a temporary key. 
*/ - file = shmem_kernel_file_setup("", datalen, 0); + file = shmem_kernel_file_setup("", datalen, 0, 0); if (IS_ERR(file)) { ret = PTR_ERR(file); goto error; diff -Nur linux-4.2/tools/testing/selftests/Makefile linux-4.2-pck/tools/testing/selftests/Makefile --- linux-4.2/tools/testing/selftests/Makefile 2015-08-30 15:34:09.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/Makefile 2015-09-08 00:48:22.245029757 -0300 @@ -6,6 +6,7 @@ TARGETS += ftrace TARGETS += futex TARGETS += kcmp +TARGETS += kdbus TARGETS += memfd TARGETS += memory-hotplug TARGETS += mount diff -Nur linux-4.2/tools/testing/selftests/kdbus/.gitignore linux-4.2-pck/tools/testing/selftests/kdbus/.gitignore --- linux-4.2/tools/testing/selftests/kdbus/.gitignore 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/kdbus/.gitignore 2015-09-08 00:48:22.245029757 -0300 @@ -0,0 +1 @@ +kdbus-test diff -Nur linux-4.2/tools/testing/selftests/kdbus/Makefile linux-4.2-pck/tools/testing/selftests/kdbus/Makefile --- linux-4.2/tools/testing/selftests/kdbus/Makefile 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/kdbus/Makefile 2015-09-08 00:48:22.245029757 -0300 @@ -0,0 +1,49 @@ +CFLAGS += -I../../../../usr/include/ +CFLAGS += -I../../../../samples/kdbus/ +CFLAGS += -I../../../../include/uapi/ +CFLAGS += -std=gnu99 +CFLAGS += -DKBUILD_MODNAME=\"kdbus\" -D_GNU_SOURCE +LDLIBS = -pthread -lcap -lm + +OBJS= \ + kdbus-enum.o \ + kdbus-util.o \ + kdbus-test.o \ + kdbus-test.o \ + test-activator.o \ + test-benchmark.o \ + test-bus.o \ + test-chat.o \ + test-connection.o \ + test-daemon.o \ + test-endpoint.o \ + test-fd.o \ + test-free.o \ + test-match.o \ + test-message.o \ + test-metadata-ns.o \ + test-monitor.o \ + test-names.o \ + test-policy.o \ + test-policy-ns.o \ + test-policy-priv.o \ + test-sync.o \ + test-timeout.o + +all: kdbus-test + +include ../lib.mk + +%.o: %.c kdbus-enum.h kdbus-test.h kdbus-util.h + $(CC) $(CFLAGS) -c $< -o $@ + +kdbus-test: $(OBJS) + $(CC) $(CFLAGS) $^ $(LDLIBS) -o $@ + +TEST_PROGS := kdbus-test + +run_tests: + ./kdbus-test --tap + +clean: + rm -f *.o kdbus-test diff -Nur linux-4.2/tools/testing/selftests/kdbus/kdbus-enum.c linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-enum.c --- linux-4.2/tools/testing/selftests/kdbus/kdbus-enum.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-enum.c 2015-09-08 00:48:22.245029757 -0300 @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kdbus-util.h" +#include "kdbus-enum.h" + +struct kdbus_enum_table { + long long id; + const char *name; +}; + +#define TABLE(what) static struct kdbus_enum_table kdbus_table_##what[] +#define ENUM(_id) { .id = _id, .name = STRINGIFY(_id) } +#define LOOKUP(what) \ + const char *enum_##what(long long id) \ + { \ + for (size_t i = 0; i < ELEMENTSOF(kdbus_table_##what); i++) \ + if (id == kdbus_table_##what[i].id) \ + return kdbus_table_##what[i].name; \ + return "UNKNOWN"; \ + } + +TABLE(CMD) = { + ENUM(KDBUS_CMD_BUS_MAKE), + ENUM(KDBUS_CMD_ENDPOINT_MAKE), + ENUM(KDBUS_CMD_HELLO), + ENUM(KDBUS_CMD_SEND), + ENUM(KDBUS_CMD_RECV), + ENUM(KDBUS_CMD_LIST), + ENUM(KDBUS_CMD_NAME_RELEASE), + ENUM(KDBUS_CMD_CONN_INFO), + ENUM(KDBUS_CMD_MATCH_ADD), + ENUM(KDBUS_CMD_MATCH_REMOVE), +}; +LOOKUP(CMD); + +TABLE(MSG) = { + ENUM(_KDBUS_ITEM_NULL), + ENUM(KDBUS_ITEM_PAYLOAD_VEC), + ENUM(KDBUS_ITEM_PAYLOAD_OFF), + ENUM(KDBUS_ITEM_PAYLOAD_MEMFD), + ENUM(KDBUS_ITEM_FDS), + ENUM(KDBUS_ITEM_BLOOM_PARAMETER), + ENUM(KDBUS_ITEM_BLOOM_FILTER), + ENUM(KDBUS_ITEM_DST_NAME), + ENUM(KDBUS_ITEM_MAKE_NAME), + ENUM(KDBUS_ITEM_ATTACH_FLAGS_SEND), + ENUM(KDBUS_ITEM_ATTACH_FLAGS_RECV), + ENUM(KDBUS_ITEM_ID), + ENUM(KDBUS_ITEM_NAME), + ENUM(KDBUS_ITEM_TIMESTAMP), + ENUM(KDBUS_ITEM_CREDS), + ENUM(KDBUS_ITEM_PIDS), + ENUM(KDBUS_ITEM_AUXGROUPS), + ENUM(KDBUS_ITEM_OWNED_NAME), + ENUM(KDBUS_ITEM_TID_COMM), + ENUM(KDBUS_ITEM_PID_COMM), + ENUM(KDBUS_ITEM_EXE), + ENUM(KDBUS_ITEM_CMDLINE), + ENUM(KDBUS_ITEM_CGROUP), + ENUM(KDBUS_ITEM_CAPS), + ENUM(KDBUS_ITEM_SECLABEL), + ENUM(KDBUS_ITEM_AUDIT), + ENUM(KDBUS_ITEM_CONN_DESCRIPTION), + ENUM(KDBUS_ITEM_NAME_ADD), + ENUM(KDBUS_ITEM_NAME_REMOVE), + ENUM(KDBUS_ITEM_NAME_CHANGE), + ENUM(KDBUS_ITEM_ID_ADD), + ENUM(KDBUS_ITEM_ID_REMOVE), + ENUM(KDBUS_ITEM_REPLY_TIMEOUT), + ENUM(KDBUS_ITEM_REPLY_DEAD), +}; +LOOKUP(MSG); + +TABLE(PAYLOAD) = { + ENUM(KDBUS_PAYLOAD_KERNEL), + ENUM(KDBUS_PAYLOAD_DBUS), +}; +LOOKUP(PAYLOAD); diff -Nur linux-4.2/tools/testing/selftests/kdbus/kdbus-enum.h linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-enum.h --- linux-4.2/tools/testing/selftests/kdbus/kdbus-enum.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-enum.h 2015-09-08 00:48:22.245029757 -0300 @@ -0,0 +1,15 @@ +/* + * Copyright (C) 2013-2015 Kay Sievers + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ */ + +#pragma once + +const char *enum_CMD(long long id); +const char *enum_MSG(long long id); +const char *enum_MATCH(long long id); +const char *enum_PAYLOAD(long long id); diff -Nur linux-4.2/tools/testing/selftests/kdbus/kdbus-test.c linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-test.c --- linux-4.2/tools/testing/selftests/kdbus/kdbus-test.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-test.c 2015-09-08 00:48:22.245029757 -0300 @@ -0,0 +1,905 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kdbus-util.h" +#include "kdbus-enum.h" +#include "kdbus-test.h" + +enum { + TEST_CREATE_BUS = 1 << 0, + TEST_CREATE_CONN = 1 << 1, +}; + +struct kdbus_test { + const char *name; + const char *desc; + int (*func)(struct kdbus_test_env *env); + unsigned int flags; +}; + +struct kdbus_test_args { + bool mntns; + bool pidns; + bool userns; + char *uid_map; + char *gid_map; + int loop; + int wait; + int fork; + int tap_output; + char *module; + char *root; + char *test; + char *busname; +}; + +static const struct kdbus_test tests[] = { + { + .name = "bus-make", + .desc = "bus make functions", + .func = kdbus_test_bus_make, + .flags = 0, + }, + { + .name = "hello", + .desc = "the HELLO command", + .func = kdbus_test_hello, + .flags = TEST_CREATE_BUS, + }, + { + .name = "byebye", + .desc = "the BYEBYE command", + .func = kdbus_test_byebye, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "chat", + .desc = "a chat pattern", + .func = kdbus_test_chat, + .flags = TEST_CREATE_BUS, + }, + { + .name = "daemon", + .desc = "a simple daemon", + .func = kdbus_test_daemon, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "fd-passing", + .desc = "file descriptor passing", + .func = kdbus_test_fd_passing, + .flags = TEST_CREATE_BUS, + }, + { + .name = "endpoint", + .desc = "custom endpoint", + .func = kdbus_test_custom_endpoint, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "monitor", + .desc = "monitor functionality", + .func = kdbus_test_monitor, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "name-basics", + .desc = "basic name registry functions", + .func = kdbus_test_name_basic, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "name-conflict", + .desc = "name registry conflict details", + .func = kdbus_test_name_conflict, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "name-queue", + .desc = "queuing of names", + .func = kdbus_test_name_queue, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "name-takeover", + .desc = "takeover of names", + .func = kdbus_test_name_takeover, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "message-basic", + .desc = "basic message handling", + .func = kdbus_test_message_basic, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "message-prio", + .desc = "handling of messages with priority", + .func = kdbus_test_message_prio, + .flags = TEST_CREATE_BUS, + }, + { + .name = "message-quota", + .desc = "message quotas are enforced", + .func = kdbus_test_message_quota, + .flags = TEST_CREATE_BUS, + }, + { + .name = "memory-access", + .desc = "memory access", + .func = kdbus_test_memory_access, + .flags = TEST_CREATE_BUS, + }, + { + .name = "timeout", + .desc = "timeout", + .func = kdbus_test_timeout, + .flags = TEST_CREATE_BUS, + 
}, + { + .name = "sync-byebye", + .desc = "synchronous replies vs. BYEBYE", + .func = kdbus_test_sync_byebye, + .flags = TEST_CREATE_BUS, + }, + { + .name = "sync-reply", + .desc = "synchronous replies", + .func = kdbus_test_sync_reply, + .flags = TEST_CREATE_BUS, + }, + { + .name = "message-free", + .desc = "freeing of memory", + .func = kdbus_test_free, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "connection-info", + .desc = "retrieving connection information", + .func = kdbus_test_conn_info, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "connection-update", + .desc = "updating connection information", + .func = kdbus_test_conn_update, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "writable-pool", + .desc = "verifying pools are never writable", + .func = kdbus_test_writable_pool, + .flags = TEST_CREATE_BUS, + }, + { + .name = "policy", + .desc = "policy", + .func = kdbus_test_policy, + .flags = TEST_CREATE_BUS, + }, + { + .name = "policy-priv", + .desc = "unprivileged bus access", + .func = kdbus_test_policy_priv, + .flags = TEST_CREATE_BUS, + }, + { + .name = "policy-ns", + .desc = "policy in user namespaces", + .func = kdbus_test_policy_ns, + .flags = TEST_CREATE_BUS, + }, + { + .name = "metadata-ns", + .desc = "metadata in different namespaces", + .func = kdbus_test_metadata_ns, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-id-add", + .desc = "adding of matches by id", + .func = kdbus_test_match_id_add, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-id-remove", + .desc = "removing of matches by id", + .func = kdbus_test_match_id_remove, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-replace", + .desc = "replace of matches with the same cookie", + .func = kdbus_test_match_replace, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-name-add", + .desc = "adding of matches by name", + .func = kdbus_test_match_name_add, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-name-remove", + .desc = "removing of matches by name", + .func = kdbus_test_match_name_remove, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-name-change", + .desc = "matching for name changes", + .func = kdbus_test_match_name_change, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "match-bloom", + .desc = "matching with bloom filters", + .func = kdbus_test_match_bloom, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "activator", + .desc = "activator connections", + .func = kdbus_test_activator, + .flags = TEST_CREATE_BUS | TEST_CREATE_CONN, + }, + { + .name = "benchmark", + .desc = "benchmark", + .func = kdbus_test_benchmark, + .flags = TEST_CREATE_BUS, + }, + { + .name = "benchmark-nomemfds", + .desc = "benchmark without using memfds", + .func = kdbus_test_benchmark_nomemfds, + .flags = TEST_CREATE_BUS, + }, + { + .name = "benchmark-uds", + .desc = "benchmark comparison to UDS", + .func = kdbus_test_benchmark_uds, + .flags = TEST_CREATE_BUS, + }, +}; + +#define N_TESTS ((int) (sizeof(tests) / sizeof(tests[0]))) + +static int test_prepare_env(const struct kdbus_test *t, + const struct kdbus_test_args *args, + struct kdbus_test_env *env) +{ + if (t->flags & TEST_CREATE_BUS) { + char *s; + char *n = NULL; + int ret; + + asprintf(&s, "%s/control", args->root); + + env->control_fd = open(s, O_RDWR); + free(s); + ASSERT_RETURN(env->control_fd >= 0); + + if (!args->busname) { + n = 
unique_name("test-bus"); + ASSERT_RETURN(n); + } + + ret = kdbus_create_bus(env->control_fd, + args->busname ?: n, + _KDBUS_ATTACH_ALL, &s); + free(n); + ASSERT_RETURN(ret == 0); + + asprintf(&env->buspath, "%s/%s/bus", args->root, s); + free(s); + } + + if (t->flags & TEST_CREATE_CONN) { + env->conn = kdbus_hello(env->buspath, 0, NULL, 0); + ASSERT_RETURN(env->conn); + } + + env->root = args->root; + env->module = args->module; + + return 0; +} + +void test_unprepare_env(const struct kdbus_test *t, struct kdbus_test_env *env) +{ + if (env->conn) { + kdbus_conn_free(env->conn); + env->conn = NULL; + } + + if (env->control_fd >= 0) { + close(env->control_fd); + env->control_fd = -1; + } + + if (env->buspath) { + free(env->buspath); + env->buspath = NULL; + } +} + +static int test_run(const struct kdbus_test *t, + const struct kdbus_test_args *kdbus_args, + int wait) +{ + int ret; + struct kdbus_test_env env = {}; + + ret = test_prepare_env(t, kdbus_args, &env); + if (ret != TEST_OK) + return ret; + + if (wait > 0) { + printf("Sleeping %d seconds before running test ...\n", wait); + sleep(wait); + } + + ret = t->func(&env); + test_unprepare_env(t, &env); + return ret; +} + +static int test_run_forked(const struct kdbus_test *t, + const struct kdbus_test_args *kdbus_args, + int wait) +{ + int ret; + pid_t pid; + + pid = fork(); + if (pid < 0) { + return TEST_ERR; + } else if (pid == 0) { + ret = test_run(t, kdbus_args, wait); + _exit(ret); + } + + pid = waitpid(pid, &ret, 0); + if (pid <= 0) + return TEST_ERR; + else if (!WIFEXITED(ret)) + return TEST_ERR; + else + return WEXITSTATUS(ret); +} + +static void print_test_result(int ret) +{ + switch (ret) { + case TEST_OK: + printf("OK"); + break; + case TEST_SKIP: + printf("SKIPPED"); + break; + case TEST_ERR: + printf("ERROR"); + break; + } +} + +static int start_all_tests(struct kdbus_test_args *kdbus_args) +{ + int ret; + unsigned int fail_cnt = 0; + unsigned int skip_cnt = 0; + unsigned int ok_cnt = 0; + unsigned int i; + + if (kdbus_args->tap_output) { + printf("1..%d\n", N_TESTS); + fflush(stdout); + } + + kdbus_util_verbose = false; + + for (i = 0; i < N_TESTS; i++) { + const struct kdbus_test *t = tests + i; + + if (!kdbus_args->tap_output) { + unsigned int n; + + printf("Testing %s (%s) ", t->desc, t->name); + for (n = 0; n < 60 - strlen(t->desc) - strlen(t->name); n++) + printf("."); + printf(" "); + } + + ret = test_run_forked(t, kdbus_args, 0); + switch (ret) { + case TEST_OK: + ok_cnt++; + break; + case TEST_SKIP: + skip_cnt++; + break; + case TEST_ERR: + fail_cnt++; + break; + } + + if (kdbus_args->tap_output) { + printf("%sok %d - %s%s (%s)\n", + (ret == TEST_ERR) ? "not " : "", i + 1, + (ret == TEST_SKIP) ? "# SKIP " : "", + t->desc, t->name); + fflush(stdout); + } else { + print_test_result(ret); + printf("\n"); + } + } + + if (kdbus_args->tap_output) + printf("Failed %d/%d tests, %.2f%% okay\n", fail_cnt, N_TESTS, + 100.0 - (fail_cnt * 100.0) / ((float) N_TESTS)); + else + printf("\nSUMMARY: %u tests passed, %u skipped, %u failed\n", + ok_cnt, skip_cnt, fail_cnt); + + return fail_cnt > 0 ? 
TEST_ERR : TEST_OK; +} + +static int start_one_test(struct kdbus_test_args *kdbus_args) +{ + int i, ret; + bool test_found = false; + + for (i = 0; i < N_TESTS; i++) { + const struct kdbus_test *t = tests + i; + + if (strcmp(t->name, kdbus_args->test)) + continue; + + do { + test_found = true; + if (kdbus_args->fork) + ret = test_run_forked(t, kdbus_args, + kdbus_args->wait); + else + ret = test_run(t, kdbus_args, + kdbus_args->wait); + + printf("Testing %s: ", t->desc); + print_test_result(ret); + printf("\n"); + + if (ret != TEST_OK) + break; + } while (kdbus_args->loop); + + return ret; + } + + if (!test_found) { + printf("Unknown test-id '%s'\n", kdbus_args->test); + return TEST_ERR; + } + + return TEST_OK; +} + +static void usage(const char *argv0) +{ + unsigned int i, j; + + printf("Usage: %s [options]\n" + "Options:\n" + "\t-a, --tap Output test results in TAP format\n" + "\t-m, --module Kdbus module name\n" + "\t-x, --loop Run in a loop\n" + "\t-f, --fork Fork before running a test\n" + "\t-h, --help Print this help\n" + "\t-r, --root Toplevel of the kdbus hierarchy\n" + "\t-t, --test Run one specific test only, in verbose mode\n" + "\t-b, --bus Instead of generating a random bus name, take .\n" + "\t-w, --wait Wait before actually starting test\n" + "\t --mntns New mount namespace\n" + "\t --pidns New PID namespace\n" + "\t --userns New user namespace\n" + "\t --uidmap uid_map UID map for user namespace\n" + "\t --gidmap gid_map GID map for user namespace\n" + "\n", argv0); + + printf("By default, all test are run once, and a summary is printed.\n" + "Available tests for --test:\n\n"); + + for (i = 0; i < N_TESTS; i++) { + const struct kdbus_test *t = tests + i; + + printf("\t%s", t->name); + + for (j = 0; j < 24 - strlen(t->name); j++) + printf(" "); + + printf("Test %s\n", t->desc); + } + + printf("\n"); + printf("Note that some tests may, if run specifically by --test, " + "behave differently, and not terminate by themselves.\n"); + + exit(EXIT_FAILURE); +} + +void print_kdbus_test_args(struct kdbus_test_args *args) +{ + if (args->userns || args->pidns || args->mntns) + printf("# Starting tests in new %s%s%s namespaces%s\n", + args->mntns ? "MOUNT " : "", + args->pidns ? "PID " : "", + args->userns ? "USER " : "", + args->mntns ? ", kdbusfs will be remounted" : ""); + else + printf("# Starting tests in the same namespaces\n"); +} + +void print_metadata_support(void) +{ + bool no_meta_audit, no_meta_cgroups, no_meta_seclabel; + + /* + * KDBUS_ATTACH_CGROUP, KDBUS_ATTACH_AUDIT and + * KDBUS_ATTACH_SECLABEL + */ + no_meta_audit = !config_auditsyscall_is_enabled(); + no_meta_cgroups = !config_cgroups_is_enabled(); + no_meta_seclabel = !config_security_is_enabled(); + + if (no_meta_audit | no_meta_cgroups | no_meta_seclabel) + printf("# Starting tests without %s%s%s metadata support\n", + no_meta_audit ? "AUDIT " : "", + no_meta_cgroups ? "CGROUP " : "", + no_meta_seclabel ? 
"SECLABEL " : ""); + else + printf("# Starting tests with full metadata support\n"); +} + +int run_tests(struct kdbus_test_args *kdbus_args) +{ + int ret; + static char control[4096]; + + snprintf(control, sizeof(control), "%s/control", kdbus_args->root); + + if (access(control, W_OK) < 0) { + printf("Unable to locate control node at '%s'.\n", + control); + return TEST_ERR; + } + + if (kdbus_args->test) { + ret = start_one_test(kdbus_args); + } else { + do { + ret = start_all_tests(kdbus_args); + if (ret != TEST_OK) + break; + } while (kdbus_args->loop); + } + + return ret; +} + +static void nop_handler(int sig) {} + +static int test_prepare_mounts(struct kdbus_test_args *kdbus_args) +{ + int ret; + char kdbusfs[64] = {'\0'}; + + snprintf(kdbusfs, sizeof(kdbusfs), "%sfs", kdbus_args->module); + + /* make current mount slave */ + ret = mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL); + if (ret < 0) { + ret = -errno; + printf("error mount() root: %d (%m)\n", ret); + return ret; + } + + /* Remount procfs since we need it in our tests */ + if (kdbus_args->pidns) { + ret = mount("proc", "/proc", "proc", + MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); + if (ret < 0) { + ret = -errno; + printf("error mount() /proc : %d (%m)\n", ret); + return ret; + } + } + + /* Remount kdbusfs */ + ret = mount(kdbusfs, kdbus_args->root, kdbusfs, + MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL); + if (ret < 0) { + ret = -errno; + printf("error mount() %s :%d (%m)\n", kdbusfs, ret); + return ret; + } + + return 0; +} + +int run_tests_in_namespaces(struct kdbus_test_args *kdbus_args) +{ + int ret; + int efd = -1; + int status; + pid_t pid, rpid; + struct sigaction oldsa; + struct sigaction sa = { + .sa_handler = nop_handler, + .sa_flags = SA_NOCLDSTOP, + }; + + efd = eventfd(0, EFD_CLOEXEC); + if (efd < 0) { + ret = -errno; + printf("eventfd() failed: %d (%m)\n", ret); + return TEST_ERR; + } + + ret = sigaction(SIGCHLD, &sa, &oldsa); + if (ret < 0) { + ret = -errno; + printf("sigaction() failed: %d (%m)\n", ret); + return TEST_ERR; + } + + /* setup namespaces */ + pid = syscall(__NR_clone, SIGCHLD| + (kdbus_args->userns ? CLONE_NEWUSER : 0) | + (kdbus_args->mntns ? CLONE_NEWNS : 0) | + (kdbus_args->pidns ? 
CLONE_NEWPID : 0), NULL); + if (pid < 0) { + printf("clone() failed: %d (%m)\n", -errno); + return TEST_ERR; + } + + if (pid == 0) { + eventfd_t event_status = 0; + + ret = prctl(PR_SET_PDEATHSIG, SIGKILL); + if (ret < 0) { + ret = -errno; + printf("error prctl(): %d (%m)\n", ret); + _exit(TEST_ERR); + } + + /* reset sighandlers of childs */ + ret = sigaction(SIGCHLD, &oldsa, NULL); + if (ret < 0) { + ret = -errno; + printf("sigaction() failed: %d (%m)\n", ret); + _exit(TEST_ERR); + } + + ret = eventfd_read(efd, &event_status); + if (ret < 0 || event_status != 1) { + printf("error eventfd_read()\n"); + _exit(TEST_ERR); + } + + if (kdbus_args->mntns) { + ret = test_prepare_mounts(kdbus_args); + if (ret < 0) { + printf("error preparing mounts\n"); + _exit(TEST_ERR); + } + } + + ret = run_tests(kdbus_args); + _exit(ret); + } + + /* Setup userns mapping */ + if (kdbus_args->userns) { + ret = userns_map_uid_gid(pid, kdbus_args->uid_map, + kdbus_args->gid_map); + if (ret < 0) { + printf("error mapping uid and gid in userns\n"); + eventfd_write(efd, 2); + return TEST_ERR; + } + } + + ret = eventfd_write(efd, 1); + if (ret < 0) { + ret = -errno; + printf("error eventfd_write(): %d (%m)\n", ret); + return TEST_ERR; + } + + rpid = waitpid(pid, &status, 0); + ASSERT_RETURN_VAL(rpid == pid, TEST_ERR); + + close(efd); + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) + return TEST_ERR; + + return TEST_OK; +} + +int start_tests(struct kdbus_test_args *kdbus_args) +{ + int ret; + bool namespaces; + static char fspath[4096]; + + namespaces = (kdbus_args->mntns || kdbus_args->pidns || + kdbus_args->userns); + + /* for pidns we need mntns set */ + if (kdbus_args->pidns && !kdbus_args->mntns) { + printf("Failed: please set both pid and mnt namesapces\n"); + return TEST_ERR; + } + + if (kdbus_args->userns) { + if (!config_user_ns_is_enabled()) { + printf("User namespace not supported\n"); + return TEST_ERR; + } + + if (!kdbus_args->uid_map || !kdbus_args->gid_map) { + printf("Failed: please specify uid or gid mapping\n"); + return TEST_ERR; + } + } + + print_kdbus_test_args(kdbus_args); + print_metadata_support(); + + /* setup kdbus paths */ + if (!kdbus_args->module) + kdbus_args->module = "kdbus"; + + if (!kdbus_args->root) { + snprintf(fspath, sizeof(fspath), "/sys/fs/%s", + kdbus_args->module); + kdbus_args->root = fspath; + } + + /* Start tests */ + if (namespaces) + ret = run_tests_in_namespaces(kdbus_args); + else + ret = run_tests(kdbus_args); + + return ret; +} + +int main(int argc, char *argv[]) +{ + int t, ret = 0; + struct kdbus_test_args *kdbus_args; + enum { + ARG_MNTNS = 0x100, + ARG_PIDNS, + ARG_USERNS, + ARG_UIDMAP, + ARG_GIDMAP, + }; + + kdbus_args = malloc(sizeof(*kdbus_args)); + if (!kdbus_args) { + printf("unable to malloc() kdbus_args\n"); + return EXIT_FAILURE; + } + + memset(kdbus_args, 0, sizeof(*kdbus_args)); + + static const struct option options[] = { + { "loop", no_argument, NULL, 'x' }, + { "help", no_argument, NULL, 'h' }, + { "root", required_argument, NULL, 'r' }, + { "test", required_argument, NULL, 't' }, + { "bus", required_argument, NULL, 'b' }, + { "wait", required_argument, NULL, 'w' }, + { "fork", no_argument, NULL, 'f' }, + { "module", required_argument, NULL, 'm' }, + { "tap", no_argument, NULL, 'a' }, + { "mntns", no_argument, NULL, ARG_MNTNS }, + { "pidns", no_argument, NULL, ARG_PIDNS }, + { "userns", no_argument, NULL, ARG_USERNS }, + { "uidmap", required_argument, NULL, ARG_UIDMAP }, + { "gidmap", required_argument, NULL, ARG_GIDMAP }, + {} + }; + + 
srand(time(NULL)); + + while ((t = getopt_long(argc, argv, "hxfm:r:t:b:w:a", options, NULL)) >= 0) { + switch (t) { + case 'x': + kdbus_args->loop = 1; + break; + + case 'm': + kdbus_args->module = optarg; + break; + + case 'r': + kdbus_args->root = optarg; + break; + + case 't': + kdbus_args->test = optarg; + break; + + case 'b': + kdbus_args->busname = optarg; + break; + + case 'w': + kdbus_args->wait = strtol(optarg, NULL, 10); + break; + + case 'f': + kdbus_args->fork = 1; + break; + + case 'a': + kdbus_args->tap_output = 1; + break; + + case ARG_MNTNS: + kdbus_args->mntns = true; + break; + + case ARG_PIDNS: + kdbus_args->pidns = true; + break; + + case ARG_USERNS: + kdbus_args->userns = true; + break; + + case ARG_UIDMAP: + kdbus_args->uid_map = optarg; + break; + + case ARG_GIDMAP: + kdbus_args->gid_map = optarg; + break; + + default: + case 'h': + usage(argv[0]); + } + } + + ret = start_tests(kdbus_args); + if (ret == TEST_ERR) + return EXIT_FAILURE; + + free(kdbus_args); + + return 0; +} diff -Nur linux-4.2/tools/testing/selftests/kdbus/kdbus-test.h linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-test.h --- linux-4.2/tools/testing/selftests/kdbus/kdbus-test.h 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-test.h 2015-09-08 00:48:22.245029757 -0300 @@ -0,0 +1,84 @@ +#ifndef _TEST_KDBUS_H_ +#define _TEST_KDBUS_H_ + +struct kdbus_test_env { + char *buspath; + const char *root; + const char *module; + int control_fd; + struct kdbus_conn *conn; +}; + +enum { + TEST_OK, + TEST_SKIP, + TEST_ERR, +}; + +#define ASSERT_RETURN_VAL(cond, val) \ + if (!(cond)) { \ + fprintf(stderr, "Assertion '%s' failed in %s(), %s:%d\n", \ + #cond, __func__, __FILE__, __LINE__); \ + return val; \ + } + +#define ASSERT_EXIT_VAL(cond, val) \ + if (!(cond)) { \ + fprintf(stderr, "Assertion '%s' failed in %s(), %s:%d\n", \ + #cond, __func__, __FILE__, __LINE__); \ + _exit(val); \ + } + +#define ASSERT_BREAK(cond) \ + if (!(cond)) { \ + fprintf(stderr, "Assertion '%s' failed in %s(), %s:%d\n", \ + #cond, __func__, __FILE__, __LINE__); \ + break; \ + } + +#define ASSERT_RETURN(cond) \ + ASSERT_RETURN_VAL(cond, TEST_ERR) + +#define ASSERT_EXIT(cond) \ + ASSERT_EXIT_VAL(cond, EXIT_FAILURE) + +int kdbus_test_activator(struct kdbus_test_env *env); +int kdbus_test_benchmark(struct kdbus_test_env *env); +int kdbus_test_benchmark_nomemfds(struct kdbus_test_env *env); +int kdbus_test_benchmark_uds(struct kdbus_test_env *env); +int kdbus_test_bus_make(struct kdbus_test_env *env); +int kdbus_test_byebye(struct kdbus_test_env *env); +int kdbus_test_chat(struct kdbus_test_env *env); +int kdbus_test_conn_info(struct kdbus_test_env *env); +int kdbus_test_conn_update(struct kdbus_test_env *env); +int kdbus_test_daemon(struct kdbus_test_env *env); +int kdbus_test_custom_endpoint(struct kdbus_test_env *env); +int kdbus_test_fd_passing(struct kdbus_test_env *env); +int kdbus_test_free(struct kdbus_test_env *env); +int kdbus_test_hello(struct kdbus_test_env *env); +int kdbus_test_match_bloom(struct kdbus_test_env *env); +int kdbus_test_match_id_add(struct kdbus_test_env *env); +int kdbus_test_match_id_remove(struct kdbus_test_env *env); +int kdbus_test_match_replace(struct kdbus_test_env *env); +int kdbus_test_match_name_add(struct kdbus_test_env *env); +int kdbus_test_match_name_change(struct kdbus_test_env *env); +int kdbus_test_match_name_remove(struct kdbus_test_env *env); +int kdbus_test_message_basic(struct kdbus_test_env *env); +int kdbus_test_message_prio(struct kdbus_test_env 
*env); +int kdbus_test_message_quota(struct kdbus_test_env *env); +int kdbus_test_memory_access(struct kdbus_test_env *env); +int kdbus_test_metadata_ns(struct kdbus_test_env *env); +int kdbus_test_monitor(struct kdbus_test_env *env); +int kdbus_test_name_basic(struct kdbus_test_env *env); +int kdbus_test_name_conflict(struct kdbus_test_env *env); +int kdbus_test_name_queue(struct kdbus_test_env *env); +int kdbus_test_name_takeover(struct kdbus_test_env *env); +int kdbus_test_policy(struct kdbus_test_env *env); +int kdbus_test_policy_ns(struct kdbus_test_env *env); +int kdbus_test_policy_priv(struct kdbus_test_env *env); +int kdbus_test_sync_byebye(struct kdbus_test_env *env); +int kdbus_test_sync_reply(struct kdbus_test_env *env); +int kdbus_test_timeout(struct kdbus_test_env *env); +int kdbus_test_writable_pool(struct kdbus_test_env *env); + +#endif /* _TEST_KDBUS_H_ */ diff -Nur linux-4.2/tools/testing/selftests/kdbus/kdbus-util.c linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-util.c --- linux-4.2/tools/testing/selftests/kdbus/kdbus-util.c 1969-12-31 21:00:00.000000000 -0300 +++ linux-4.2-pck/tools/testing/selftests/kdbus/kdbus-util.c 2015-09-08 00:48:22.245029757 -0300 @@ -0,0 +1,1612 @@ +/* + * Copyright (C) 2013-2015 Daniel Mack + * Copyright (C) 2013-2015 Kay Sievers + * Copyright (C) 2014-2015 Djalal Harouni + * + * kdbus is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __NR_memfd_create + #ifdef __x86_64__ + #define __NR_memfd_create 319 + #elif defined __arm__ + #define __NR_memfd_create 385 + #else + #define __NR_memfd_create 356 + #endif +#endif + +#include "kdbus-api.h" +#include "kdbus-util.h" +#include "kdbus-enum.h" + +#ifndef F_ADD_SEALS +#define F_LINUX_SPECIFIC_BASE 1024 +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +#endif + +int kdbus_util_verbose = true; + +int kdbus_sysfs_get_parameter_mask(const char *path, uint64_t *mask) +{ + int ret; + FILE *file; + unsigned long long value; + + file = fopen(path, "r"); + if (!file) { + ret = -errno; + kdbus_printf("--- error fopen(): %d (%m)\n", ret); + return ret; + } + + ret = fscanf(file, "%llu", &value); + if (ret != 1) { + if (ferror(file)) + ret = -errno; + else + ret = -EIO; + + kdbus_printf("--- error fscanf(): %d\n", ret); + fclose(file); + return ret; + } + + *mask = (uint64_t)value; + + fclose(file); + + return 0; +} + +int kdbus_sysfs_set_parameter_mask(const char *path, uint64_t mask) +{ + int ret; + FILE *file; + + file = fopen(path, "w"); + if (!file) { + ret = -errno; + kdbus_printf("--- error open(): %d (%m)\n", ret); + return ret; + } + + ret = fprintf(file, "%llu", (unsigned long long)mask); + if (ret <= 0) { + ret = -EIO; + kdbus_printf("--- error fprintf(): %d\n", ret); + } + + fclose(file); + + return ret > 0 ? 
0 : ret; +} + +int kdbus_create_bus(int control_fd, const char *name, + uint64_t owner_meta, char **path) +{ + struct { + struct kdbus_cmd cmd; + + /* bloom size item */ + struct { + uint64_t size; + uint64_t type; + struct kdbus_bloom_parameter bloom; + } bp; + + /* owner metadata items */ + struct { + uint64_t size; + uint64_t type; + uint64_t flags; + } attach; + + /* name item */ + struct { + uint64_t size; + uint64_t type; + char str[64]; + } name; + } bus_make; + int ret; + + memset(&bus_make, 0, sizeof(bus_make)); + bus_make.bp.size = sizeof(bus_make.bp); + bus_make.bp.type = KDBUS_ITEM_BLOOM_PARAMETER; + bus_make.bp.bloom.size = 64; + bus_make.bp.bloom.n_hash = 1; + + snprintf(bus_make.name.str, sizeof(bus_make.name.str), + "%u-%s", getuid(), name); + + bus_make.attach.type = KDBUS_ITEM_ATTACH_FLAGS_SEND; + bus_make.attach.size = sizeof(bus_make.attach); + bus_make.attach.flags = owner_meta; + + bus_make.name.type = KDBUS_ITEM_MAKE_NAME; + bus_make.name.size = KDBUS_ITEM_HEADER_SIZE + + strlen(bus_make.name.str) + 1; + + bus_make.cmd.flags = KDBUS_MAKE_ACCESS_WORLD; + bus_make.cmd.size = sizeof(bus_make.cmd) + + bus_make.bp.size + + bus_make.attach.size + + bus_make.name.size; + + kdbus_printf("Creating bus with name >%s< on control fd %d ...\n", + name, control_fd); + + ret = kdbus_cmd_bus_make(control_fd, &bus_make.cmd); + if (ret < 0) { + kdbus_printf("--- error when making bus: %d (%m)\n", ret); + return ret; + } + + if (ret == 0 && path) + *path = strdup(bus_make.name.str); + + return ret; +} + +struct kdbus_conn * +kdbus_hello(const char *path, uint64_t flags, + const struct kdbus_item *item, size_t item_size) +{ + struct kdbus_cmd_free cmd_free = {}; + int fd, ret; + struct { + struct kdbus_cmd_hello hello; + + struct { + uint64_t size; + uint64_t type; + char str[16]; + } conn_name; + + uint8_t extra_items[item_size]; + } h; + struct kdbus_conn *conn; + + memset(&h, 0, sizeof(h)); + + if (item_size > 0) + memcpy(h.extra_items, item, item_size); + + kdbus_printf("-- opening bus connection %s\n", path); + fd = open(path, O_RDWR|O_CLOEXEC); + if (fd < 0) { + kdbus_printf("--- error %d (%m)\n", fd); + return NULL; + } + + h.hello.flags = flags | KDBUS_HELLO_ACCEPT_FD; + h.hello.attach_flags_send = _KDBUS_ATTACH_ALL; + h.hello.attach_flags_recv = _KDBUS_ATTACH_ALL; + h.conn_name.type = KDBUS_ITEM_CONN_DESCRIPTION; + strcpy(h.conn_name.str, "this-is-my-name"); + h.conn_name.size = KDBUS_ITEM_HEADER_SIZE + strlen(h.conn_name.str) + 1; + + h.hello.size = sizeof(h); + h.hello.pool_size = POOL_SIZE; + + ret = kdbus_cmd_hello(fd, (struct kdbus_cmd_hello *) &h.hello); + if (ret < 0) { + kdbus_printf("--- error when saying hello: %d (%m)\n", ret); + return NULL; + } + kdbus_printf("-- Our peer ID for %s: %llu -- bus uuid: '%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x'\n", + path, (unsigned long long)h.hello.id, + h.hello.id128[0], h.hello.id128[1], h.hello.id128[2], + h.hello.id128[3], h.hello.id128[4], h.hello.id128[5], + h.hello.id128[6], h.hello.id128[7], h.hello.id128[8], + h.hello.id128[9], h.hello.id128[10], h.hello.id128[11], + h.hello.id128[12], h.hello.id128[13], h.hello.id128[14], + h.hello.id128[15]); + + cmd_free.size = sizeof(cmd_free); + cmd_free.offset = h.hello.offset; + kdbus_cmd_free(fd, &cmd_free); + + conn = malloc(sizeof(*conn)); + if (!conn) { + kdbus_printf("unable to malloc()!?\n"); + return NULL; + } + + conn->buf = mmap(NULL, POOL_SIZE, PROT_READ, MAP_SHARED, fd, 0); + if (conn->buf == MAP_FAILED) { + free(conn); + close(fd); + 
kdbus_printf("--- error mmap (%m)\n"); + return NULL; + } + + conn->fd = fd; + conn->id = h.hello.id; + return conn; +} + +struct kdbus_conn * +kdbus_hello_registrar(const char *path, const char *name, + const struct kdbus_policy_access *access, + size_t num_access, uint64_t flags) +{ + struct kdbus_item *item, *items; + size_t i, size; + + size = KDBUS_ITEM_SIZE(strlen(name) + 1) + + num_access * KDBUS_ITEM_SIZE(sizeof(*access)); + + items = alloca(size); + + item = items; + item->size = KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + item->type = KDBUS_ITEM_NAME; + strcpy(item->str, name); + item = KDBUS_ITEM_NEXT(item); + + for (i = 0; i < num_access; i++) { + item->size = KDBUS_ITEM_HEADER_SIZE + + sizeof(struct kdbus_policy_access); + item->type = KDBUS_ITEM_POLICY_ACCESS; + + item->policy_access.type = access[i].type; + item->policy_access.access = access[i].access; + item->policy_access.id = access[i].id; + + item = KDBUS_ITEM_NEXT(item); + } + + return kdbus_hello(path, flags, items, size); +} + +struct kdbus_conn *kdbus_hello_activator(const char *path, const char *name, + const struct kdbus_policy_access *access, + size_t num_access) +{ + return kdbus_hello_registrar(path, name, access, num_access, + KDBUS_HELLO_ACTIVATOR); +} + +bool kdbus_item_in_message(struct kdbus_msg *msg, uint64_t type) +{ + const struct kdbus_item *item; + + KDBUS_ITEM_FOREACH(item, msg, items) + if (item->type == type) + return true; + + return false; +} + +int kdbus_bus_creator_info(struct kdbus_conn *conn, + uint64_t flags, + uint64_t *offset) +{ + struct kdbus_cmd_info *cmd; + size_t size = sizeof(*cmd); + int ret; + + cmd = alloca(size); + memset(cmd, 0, size); + cmd->size = size; + cmd->attach_flags = flags; + + ret = kdbus_cmd_bus_creator_info(conn->fd, cmd); + if (ret < 0) { + kdbus_printf("--- error when requesting info: %d (%m)\n", ret); + return ret; + } + + if (offset) + *offset = cmd->offset; + else + kdbus_free(conn, cmd->offset); + + return 0; +} + +int kdbus_conn_info(struct kdbus_conn *conn, uint64_t id, + const char *name, uint64_t flags, + uint64_t *offset) +{ + struct kdbus_cmd_info *cmd; + size_t size = sizeof(*cmd); + struct kdbus_info *info; + int ret; + + if (name) + size += KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + + cmd = alloca(size); + memset(cmd, 0, size); + cmd->size = size; + cmd->attach_flags = flags; + + if (name) { + cmd->items[0].size = KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + cmd->items[0].type = KDBUS_ITEM_NAME; + strcpy(cmd->items[0].str, name); + } else { + cmd->id = id; + } + + ret = kdbus_cmd_conn_info(conn->fd, cmd); + if (ret < 0) { + kdbus_printf("--- error when requesting info: %d (%m)\n", ret); + return ret; + } + + info = (struct kdbus_info *) (conn->buf + cmd->offset); + if (info->size != cmd->info_size) { + kdbus_printf("%s(): size mismatch: %d != %d\n", __func__, + (int) info->size, (int) cmd->info_size); + return -EIO; + } + + if (offset) + *offset = cmd->offset; + else + kdbus_free(conn, cmd->offset); + + return 0; +} + +void kdbus_conn_free(struct kdbus_conn *conn) +{ + if (!conn) + return; + + if (conn->buf) + munmap(conn->buf, POOL_SIZE); + + if (conn->fd >= 0) + close(conn->fd); + + free(conn); +} + +int sys_memfd_create(const char *name, __u64 size) +{ + int ret, fd; + + fd = syscall(__NR_memfd_create, name, MFD_ALLOW_SEALING); + if (fd < 0) + return fd; + + ret = ftruncate(fd, size); + if (ret < 0) { + close(fd); + return ret; + } + + return fd; +} + +int sys_memfd_seal_set(int fd) +{ + return fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | + 
F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL); +} + +off_t sys_memfd_get_size(int fd, off_t *size) +{ + struct stat stat; + int ret; + + ret = fstat(fd, &stat); + if (ret < 0) { + kdbus_printf("stat() failed: %m\n"); + return ret; + } + + *size = stat.st_size; + return 0; +} + +static int __kdbus_msg_send(const struct kdbus_conn *conn, + const char *name, + uint64_t cookie, + uint64_t flags, + uint64_t timeout, + int64_t priority, + uint64_t dst_id, + uint64_t cmd_flags, + int cancel_fd) +{ + struct kdbus_cmd_send *cmd = NULL; + struct kdbus_msg *msg = NULL; + const char ref1[1024 * 128 + 3] = "0123456789_0"; + const char ref2[] = "0123456789_1"; + struct kdbus_item *item; + struct timespec now; + uint64_t size; + int memfd = -1; + int ret; + + size = sizeof(*msg) + 3 * KDBUS_ITEM_SIZE(sizeof(struct kdbus_vec)); + + if (dst_id == KDBUS_DST_ID_BROADCAST) + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_bloom_filter)) + 64; + else { + memfd = sys_memfd_create("my-name-is-nice", 1024 * 1024); + if (memfd < 0) { + kdbus_printf("failed to create memfd: %m\n"); + return memfd; + } + + if (write(memfd, "kdbus memfd 1234567", 19) != 19) { + ret = -errno; + kdbus_printf("writing to memfd failed: %m\n"); + goto out; + } + + ret = sys_memfd_seal_set(memfd); + if (ret < 0) { + ret = -errno; + kdbus_printf("memfd sealing failed: %m\n"); + goto out; + } + + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_memfd)); + } + + if (name) + size += KDBUS_ITEM_SIZE(strlen(name) + 1); + + msg = malloc(size); + if (!msg) { + ret = -errno; + kdbus_printf("unable to malloc()!?\n"); + goto out; + } + + if (dst_id == KDBUS_DST_ID_BROADCAST) + flags |= KDBUS_MSG_SIGNAL; + + memset(msg, 0, size); + msg->flags = flags; + msg->priority = priority; + msg->size = size; + msg->src_id = conn->id; + msg->dst_id = name ? 
0 : dst_id; + msg->cookie = cookie; + msg->payload_type = KDBUS_PAYLOAD_DBUS; + + if (timeout) { + ret = clock_gettime(CLOCK_MONOTONIC_COARSE, &now); + if (ret < 0) + goto out; + + msg->timeout_ns = now.tv_sec * 1000000000ULL + + now.tv_nsec + timeout; + } + + item = msg->items; + + if (name) { + item->type = KDBUS_ITEM_DST_NAME; + item->size = KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + strcpy(item->str, name); + item = KDBUS_ITEM_NEXT(item); + } + + item->type = KDBUS_ITEM_PAYLOAD_VEC; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec); + item->vec.address = (uintptr_t)&ref1; + item->vec.size = sizeof(ref1); + item = KDBUS_ITEM_NEXT(item); + + /* data padding for ref1 */ + item->type = KDBUS_ITEM_PAYLOAD_VEC; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec); + item->vec.address = (uintptr_t)NULL; + item->vec.size = KDBUS_ALIGN8(sizeof(ref1)) - sizeof(ref1); + item = KDBUS_ITEM_NEXT(item); + + item->type = KDBUS_ITEM_PAYLOAD_VEC; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec); + item->vec.address = (uintptr_t)&ref2; + item->vec.size = sizeof(ref2); + item = KDBUS_ITEM_NEXT(item); + + if (dst_id == KDBUS_DST_ID_BROADCAST) { + item->type = KDBUS_ITEM_BLOOM_FILTER; + item->size = KDBUS_ITEM_SIZE(sizeof(struct kdbus_bloom_filter)) + 64; + item->bloom_filter.generation = 0; + } else { + item->type = KDBUS_ITEM_PAYLOAD_MEMFD; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_memfd); + item->memfd.size = 16; + item->memfd.fd = memfd; + } + item = KDBUS_ITEM_NEXT(item); + + size = sizeof(*cmd); + if (cancel_fd != -1) + size += KDBUS_ITEM_SIZE(sizeof(cancel_fd)); + + cmd = malloc(size); + if (!cmd) { + ret = -errno; + kdbus_printf("unable to malloc()!?\n"); + goto out; + } + + cmd->size = size; + cmd->flags = cmd_flags; + cmd->msg_address = (uintptr_t)msg; + + item = cmd->items; + + if (cancel_fd != -1) { + item->type = KDBUS_ITEM_CANCEL_FD; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(cancel_fd); + item->fds[0] = cancel_fd; + item = KDBUS_ITEM_NEXT(item); + } + + ret = kdbus_cmd_send(conn->fd, cmd); + if (ret < 0) { + kdbus_printf("error sending message: %d (%m)\n", ret); + goto out; + } + + if (cmd_flags & KDBUS_SEND_SYNC_REPLY) { + struct kdbus_msg *reply; + + kdbus_printf("SYNC REPLY @offset %llu:\n", cmd->reply.offset); + reply = (struct kdbus_msg *)(conn->buf + cmd->reply.offset); + kdbus_msg_dump(conn, reply); + + kdbus_msg_free(reply); + + ret = kdbus_free(conn, cmd->reply.offset); + if (ret < 0) + goto out; + } + +out: + free(msg); + free(cmd); + + if (memfd >= 0) + close(memfd); + + return ret < 0 ? 
ret : 0; +} + +int kdbus_msg_send(const struct kdbus_conn *conn, const char *name, + uint64_t cookie, uint64_t flags, uint64_t timeout, + int64_t priority, uint64_t dst_id) +{ + return __kdbus_msg_send(conn, name, cookie, flags, timeout, priority, + dst_id, 0, -1); +} + +int kdbus_msg_send_sync(const struct kdbus_conn *conn, const char *name, + uint64_t cookie, uint64_t flags, uint64_t timeout, + int64_t priority, uint64_t dst_id, int cancel_fd) +{ + return __kdbus_msg_send(conn, name, cookie, flags, timeout, priority, + dst_id, KDBUS_SEND_SYNC_REPLY, cancel_fd); +} + +int kdbus_msg_send_reply(const struct kdbus_conn *conn, + uint64_t reply_cookie, + uint64_t dst_id) +{ + struct kdbus_cmd_send cmd = {}; + struct kdbus_msg *msg; + const char ref1[1024 * 128 + 3] = "0123456789_0"; + struct kdbus_item *item; + uint64_t size; + int ret; + + size = sizeof(struct kdbus_msg); + size += KDBUS_ITEM_SIZE(sizeof(struct kdbus_vec)); + + msg = malloc(size); + if (!msg) { + kdbus_printf("unable to malloc()!?\n"); + return -ENOMEM; + } + + memset(msg, 0, size); + msg->size = size; + msg->src_id = conn->id; + msg->dst_id = dst_id; + msg->cookie_reply = reply_cookie; + msg->payload_type = KDBUS_PAYLOAD_DBUS; + + item = msg->items; + + item->type = KDBUS_ITEM_PAYLOAD_VEC; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(struct kdbus_vec); + item->vec.address = (uintptr_t)&ref1; + item->vec.size = sizeof(ref1); + item = KDBUS_ITEM_NEXT(item); + + cmd.size = sizeof(cmd); + cmd.msg_address = (uintptr_t)msg; + + ret = kdbus_cmd_send(conn->fd, &cmd); + if (ret < 0) + kdbus_printf("error sending message: %d (%m)\n", ret); + + free(msg); + + return ret; +} + +static char *msg_id(uint64_t id, char *buf) +{ + if (id == 0) + return "KERNEL"; + if (id == ~0ULL) + return "BROADCAST"; + sprintf(buf, "%llu", (unsigned long long)id); + return buf; +} + +int kdbus_msg_dump(const struct kdbus_conn *conn, const struct kdbus_msg *msg) +{ + const struct kdbus_item *item = msg->items; + char buf_src[32]; + char buf_dst[32]; + uint64_t timeout = 0; + uint64_t cookie_reply = 0; + int ret = 0; + + if (msg->flags & KDBUS_MSG_EXPECT_REPLY) + timeout = msg->timeout_ns; + else + cookie_reply = msg->cookie_reply; + + kdbus_printf("MESSAGE: %s (%llu bytes) flags=0x%08llx, %s → %s, " + "cookie=%llu, timeout=%llu cookie_reply=%llu priority=%lli\n", + enum_PAYLOAD(msg->payload_type), (unsigned long long)msg->size, + (unsigned long long)msg->flags, + msg_id(msg->src_id, buf_src), msg_id(msg->dst_id, buf_dst), + (unsigned long long)msg->cookie, (unsigned long long)timeout, + (unsigned long long)cookie_reply, (long long)msg->priority); + + KDBUS_ITEM_FOREACH(item, msg, items) { + if (item->size < KDBUS_ITEM_HEADER_SIZE) { + kdbus_printf(" +%s (%llu bytes) invalid data record\n", + enum_MSG(item->type), item->size); + ret = -EINVAL; + break; + } + + switch (item->type) { + case KDBUS_ITEM_PAYLOAD_OFF: { + char *s; + + if (item->vec.offset == ~0ULL) + s = "[\\0-bytes]"; + else + s = (char *)msg + item->vec.offset; + + kdbus_printf(" +%s (%llu bytes) off=%llu size=%llu '%s'\n", + enum_MSG(item->type), item->size, + (unsigned long long)item->vec.offset, + (unsigned long long)item->vec.size, s); + break; + } + + case KDBUS_ITEM_FDS: { + int i, n = (item->size - KDBUS_ITEM_HEADER_SIZE) / + sizeof(int); + + kdbus_printf(" +%s (%llu bytes, %d fds)\n", + enum_MSG(item->type), item->size, n); + + for (i = 0; i < n; i++) + kdbus_printf(" fd[%d] = %d\n", + i, item->fds[i]); + + break; + } + + case KDBUS_ITEM_PAYLOAD_MEMFD: { + char *buf; + off_t size; + + 
buf = mmap(NULL, item->memfd.size, PROT_READ, + MAP_PRIVATE, item->memfd.fd, 0); + if (buf == MAP_FAILED) { + kdbus_printf("mmap() fd=%i size=%llu failed: %m\n", + item->memfd.fd, item->memfd.size); + break; + } + + if (sys_memfd_get_size(item->memfd.fd, &size) < 0) { + kdbus_printf("KDBUS_CMD_MEMFD_SIZE_GET failed: %m\n"); + break; + } + + kdbus_printf(" +%s (%llu bytes) fd=%i size=%llu filesize=%llu '%s'\n", + enum_MSG(item->type), item->size, item->memfd.fd, + (unsigned long long)item->memfd.size, + (unsigned long long)size, buf); + munmap(buf, item->memfd.size); + break; + } + + case KDBUS_ITEM_CREDS: + kdbus_printf(" +%s (%llu bytes) uid=%lld, euid=%lld, suid=%lld, fsuid=%lld, " + "gid=%lld, egid=%lld, sgid=%lld, fsgid=%lld\n", + enum_MSG(item->type), item->size, + item->creds.uid, item->creds.euid, + item->creds.suid, item->creds.fsuid, + item->creds.gid, item->creds.egid, + item->creds.sgid, item->creds.fsgid); + break; + + case KDBUS_ITEM_PIDS: + kdbus_printf(" +%s (%llu bytes) pid=%lld, tid=%lld, ppid=%lld\n", + enum_MSG(item->type), item->size, + item->pids.pid, item->pids.tid, + item->pids.ppid); + break; + + case KDBUS_ITEM_AUXGROUPS: { + int i, n; + + kdbus_printf(" +%s (%llu bytes)\n", + enum_MSG(item->type), item->size); + n = (item->size - KDBUS_ITEM_HEADER_SIZE) / + sizeof(uint64_t); + + for (i = 0; i < n; i++) + kdbus_printf(" gid[%d] = %lld\n", + i, item->data64[i]); + break; + } + + case KDBUS_ITEM_NAME: + case KDBUS_ITEM_PID_COMM: + case KDBUS_ITEM_TID_COMM: + case KDBUS_ITEM_EXE: + case KDBUS_ITEM_CGROUP: + case KDBUS_ITEM_SECLABEL: + case KDBUS_ITEM_DST_NAME: + case KDBUS_ITEM_CONN_DESCRIPTION: + kdbus_printf(" +%s (%llu bytes) '%s' (%zu)\n", + enum_MSG(item->type), item->size, + item->str, strlen(item->str)); + break; + + case KDBUS_ITEM_OWNED_NAME: { + kdbus_printf(" +%s (%llu bytes) '%s' (%zu) flags=0x%08llx\n", + enum_MSG(item->type), item->size, + item->name.name, strlen(item->name.name), + item->name.flags); + break; + } + + case KDBUS_ITEM_CMDLINE: { + size_t size = item->size - KDBUS_ITEM_HEADER_SIZE; + const char *str = item->str; + int count = 0; + + kdbus_printf(" +%s (%llu bytes) ", + enum_MSG(item->type), item->size); + while (size) { + kdbus_printf("'%s' ", str); + size -= strlen(str) + 1; + str += strlen(str) + 1; + count++; + } + + kdbus_printf("(%d string%s)\n", + count, (count == 1) ? 
"" : "s"); + break; + } + + case KDBUS_ITEM_AUDIT: + kdbus_printf(" +%s (%llu bytes) loginuid=%u sessionid=%u\n", + enum_MSG(item->type), item->size, + item->audit.loginuid, item->audit.sessionid); + break; + + case KDBUS_ITEM_CAPS: { + const uint32_t *cap; + int n, i; + + kdbus_printf(" +%s (%llu bytes) len=%llu bytes, last_cap %d\n", + enum_MSG(item->type), item->size, + (unsigned long long)item->size - + KDBUS_ITEM_HEADER_SIZE, + (int) item->caps.last_cap); + + cap = item->caps.caps; + n = (item->size - offsetof(struct kdbus_item, caps.caps)) + / 4 / sizeof(uint32_t); + + kdbus_printf(" CapInh="); + for (i = 0; i < n; i++) + kdbus_printf("%08x", cap[(0 * n) + (n - i - 1)]); + + kdbus_printf(" CapPrm="); + for (i = 0; i < n; i++) + kdbus_printf("%08x", cap[(1 * n) + (n - i - 1)]); + + kdbus_printf(" CapEff="); + for (i = 0; i < n; i++) + kdbus_printf("%08x", cap[(2 * n) + (n - i - 1)]); + + kdbus_printf(" CapBnd="); + for (i = 0; i < n; i++) + kdbus_printf("%08x", cap[(3 * n) + (n - i - 1)]); + kdbus_printf("\n"); + break; + } + + case KDBUS_ITEM_TIMESTAMP: + kdbus_printf(" +%s (%llu bytes) seq=%llu realtime=%lluns monotonic=%lluns\n", + enum_MSG(item->type), item->size, + (unsigned long long)item->timestamp.seqnum, + (unsigned long long)item->timestamp.realtime_ns, + (unsigned long long)item->timestamp.monotonic_ns); + break; + + case KDBUS_ITEM_REPLY_TIMEOUT: + kdbus_printf(" +%s (%llu bytes) cookie=%llu\n", + enum_MSG(item->type), item->size, + msg->cookie_reply); + break; + + case KDBUS_ITEM_NAME_ADD: + case KDBUS_ITEM_NAME_REMOVE: + case KDBUS_ITEM_NAME_CHANGE: + kdbus_printf(" +%s (%llu bytes) '%s', old id=%lld, now id=%lld, old_flags=0x%llx new_flags=0x%llx\n", + enum_MSG(item->type), + (unsigned long long) item->size, + item->name_change.name, + item->name_change.old_id.id, + item->name_change.new_id.id, + item->name_change.old_id.flags, + item->name_change.new_id.flags); + break; + + case KDBUS_ITEM_ID_ADD: + case KDBUS_ITEM_ID_REMOVE: + kdbus_printf(" +%s (%llu bytes) id=%llu flags=%llu\n", + enum_MSG(item->type), + (unsigned long long) item->size, + (unsigned long long) item->id_change.id, + (unsigned long long) item->id_change.flags); + break; + + default: + kdbus_printf(" +%s (%llu bytes)\n", + enum_MSG(item->type), item->size); + break; + } + } + + if ((char *)item - ((char *)msg + msg->size) >= 8) { + kdbus_printf("invalid padding at end of message\n"); + ret = -EINVAL; + } + + kdbus_printf("\n"); + + return ret; +} + +void kdbus_msg_free(struct kdbus_msg *msg) +{ + const struct kdbus_item *item; + int nfds, i; + + if (!msg) + return; + + KDBUS_ITEM_FOREACH(item, msg, items) { + switch (item->type) { + /* close all memfds */ + case KDBUS_ITEM_PAYLOAD_MEMFD: + close(item->memfd.fd); + break; + case KDBUS_ITEM_FDS: + nfds = (item->size - KDBUS_ITEM_HEADER_SIZE) / + sizeof(int); + + for (i = 0; i < nfds; i++) + close(item->fds[i]); + + break; + } + } +} + +int kdbus_msg_recv(struct kdbus_conn *conn, + struct kdbus_msg **msg_out, + uint64_t *offset) +{ + struct kdbus_cmd_recv recv = { .size = sizeof(recv) }; + struct kdbus_msg *msg; + int ret; + + ret = kdbus_cmd_recv(conn->fd, &recv); + if (ret < 0) + return ret; + + msg = (struct kdbus_msg *)(conn->buf + recv.msg.offset); + ret = kdbus_msg_dump(conn, msg); + if (ret < 0) { + kdbus_msg_free(msg); + return ret; + } + + if (msg_out) { + *msg_out = msg; + + if (offset) + *offset = recv.msg.offset; + } else { + kdbus_msg_free(msg); + + ret = kdbus_free(conn, recv.msg.offset); + if (ret < 0) + return ret; + } + + return 0; +} + 
+/*
+ * Returns: 0 on success, negative errno on failure.
+ *
+ * We must return -ETIMEDOUT, -ECONNRESET, -EAGAIN and other errors,
+ * as well as the result of kdbus_msg_recv().
+ */
+int kdbus_msg_recv_poll(struct kdbus_conn *conn,
+			int timeout_ms,
+			struct kdbus_msg **msg_out,
+			uint64_t *offset)
+{
+	int ret;
+
+	do {
+		struct timeval before, after, diff;
+		struct pollfd fd;
+
+		fd.fd = conn->fd;
+		fd.events = POLLIN | POLLPRI | POLLHUP;
+		fd.revents = 0;
+
+		gettimeofday(&before, NULL);
+		ret = poll(&fd, 1, timeout_ms);
+		gettimeofday(&after, NULL);
+
+		if (ret == 0) {
+			ret = -ETIMEDOUT;
+			break;
+		}
+
+		if (ret > 0) {
+			if (fd.revents & POLLIN)
+				ret = kdbus_msg_recv(conn, msg_out, offset);
+
+			if (fd.revents & (POLLHUP | POLLERR))
+				ret = -ECONNRESET;
+		}
+
+		if (ret == 0 || ret != -EAGAIN)
+			break;
+
+		timersub(&after, &before, &diff);
+		timeout_ms -= diff.tv_sec * 1000UL +
+			      diff.tv_usec / 1000UL;
+	} while (timeout_ms > 0);
+
+	return ret;
+}
+
+int kdbus_free(const struct kdbus_conn *conn, uint64_t offset)
+{
+	struct kdbus_cmd_free cmd_free = {};
+	int ret;
+
+	cmd_free.size = sizeof(cmd_free);
+	cmd_free.offset = offset;
+	cmd_free.flags = 0;
+
+	ret = kdbus_cmd_free(conn->fd, &cmd_free);
+	if (ret < 0) {
+		kdbus_printf("KDBUS_CMD_FREE failed: %d (%m)\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+int kdbus_name_acquire(struct kdbus_conn *conn,
+		       const char *name, uint64_t *flags)
+{
+	struct kdbus_cmd *cmd_name;
+	size_t name_len = strlen(name) + 1;
+	uint64_t size = sizeof(*cmd_name) + KDBUS_ITEM_SIZE(name_len);
+	struct kdbus_item *item;
+	int ret;
+
+	cmd_name = alloca(size);
+
+	memset(cmd_name, 0, size);
+
+	item = cmd_name->items;
+	item->size = KDBUS_ITEM_HEADER_SIZE + name_len;
+	item->type = KDBUS_ITEM_NAME;
+	strcpy(item->str, name);
+
+	cmd_name->size = size;
+	if (flags)
+		cmd_name->flags = *flags;
+
+	ret = kdbus_cmd_name_acquire(conn->fd, cmd_name);
+	if (ret < 0) {
+		kdbus_printf("error acquiring name: %s\n", strerror(-ret));
+		return ret;
+	}
+
+	kdbus_printf("%s(): flags after call: 0x%llx\n", __func__,
+		     cmd_name->return_flags);
+
+	if (flags)
+		*flags = cmd_name->return_flags;
+
+	return 0;
+}
+
+int kdbus_name_release(struct kdbus_conn *conn, const char *name)
+{
+	struct kdbus_cmd *cmd_name;
+	size_t name_len = strlen(name) + 1;
+	uint64_t size = sizeof(*cmd_name) + KDBUS_ITEM_SIZE(name_len);
+	struct kdbus_item *item;
+	int ret;
+
+	cmd_name = alloca(size);
+
+	memset(cmd_name, 0, size);
+
+	item = cmd_name->items;
+	item->size = KDBUS_ITEM_HEADER_SIZE + name_len;
+	item->type = KDBUS_ITEM_NAME;
+	strcpy(item->str, name);
+
+	cmd_name->size = size;
+
+	kdbus_printf("conn %lld giving up name '%s'\n",
+		     (unsigned long long) conn->id, name);
+
+	ret = kdbus_cmd_name_release(conn->fd, cmd_name);
+	if (ret < 0) {
+		kdbus_printf("error releasing name: %s\n", strerror(-ret));
+		return ret;
+	}
+
+	return 0;
+}
+
+int kdbus_list(struct kdbus_conn *conn, uint64_t flags)
+{
+	struct kdbus_cmd_list cmd_list = {};
+	struct kdbus_info *list, *name;
+	int ret;
+
+	cmd_list.size = sizeof(cmd_list);
+	cmd_list.flags = flags;
+
+	ret = kdbus_cmd_list(conn->fd, &cmd_list);
+	if (ret < 0) {
+		kdbus_printf("error listing names: %d (%m)\n", ret);
+		return ret;
+	}
+
+	kdbus_printf("REGISTRY:\n");
+	list = (struct kdbus_info *)(conn->buf + cmd_list.offset);
+
+	KDBUS_FOREACH(name, list, cmd_list.list_size) {
+		uint64_t flags = 0;
+		struct kdbus_item *item;
+		const char *n = "MISSING-NAME";
+
+		if (name->size == sizeof(struct kdbus_cmd))
+			continue;
+
+
KDBUS_ITEM_FOREACH(item, name, items) + if (item->type == KDBUS_ITEM_OWNED_NAME) { + n = item->name.name; + flags = item->name.flags; + + kdbus_printf("%8llu flags=0x%08llx conn=0x%08llx '%s'\n", + name->id, + (unsigned long long) flags, + name->flags, n); + } + } + kdbus_printf("\n"); + + ret = kdbus_free(conn, cmd_list.offset); + + return ret; +} + +int kdbus_conn_update_attach_flags(struct kdbus_conn *conn, + uint64_t attach_flags_send, + uint64_t attach_flags_recv) +{ + int ret; + size_t size; + struct kdbus_cmd *update; + struct kdbus_item *item; + + size = sizeof(struct kdbus_cmd); + size += KDBUS_ITEM_SIZE(sizeof(uint64_t)) * 2; + + update = malloc(size); + if (!update) { + kdbus_printf("error malloc: %m\n"); + return -ENOMEM; + } + + memset(update, 0, size); + update->size = size; + + item = update->items; + + item->type = KDBUS_ITEM_ATTACH_FLAGS_SEND; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(uint64_t); + item->data64[0] = attach_flags_send; + item = KDBUS_ITEM_NEXT(item); + + item->type = KDBUS_ITEM_ATTACH_FLAGS_RECV; + item->size = KDBUS_ITEM_HEADER_SIZE + sizeof(uint64_t); + item->data64[0] = attach_flags_recv; + item = KDBUS_ITEM_NEXT(item); + + ret = kdbus_cmd_update(conn->fd, update); + if (ret < 0) + kdbus_printf("error conn update: %d (%m)\n", ret); + + free(update); + + return ret; +} + +int kdbus_conn_update_policy(struct kdbus_conn *conn, const char *name, + const struct kdbus_policy_access *access, + size_t num_access) +{ + struct kdbus_cmd *update; + struct kdbus_item *item; + size_t i, size; + int ret; + + size = sizeof(struct kdbus_cmd); + size += KDBUS_ITEM_SIZE(strlen(name) + 1); + size += num_access * KDBUS_ITEM_SIZE(sizeof(struct kdbus_policy_access)); + + update = malloc(size); + if (!update) { + kdbus_printf("error malloc: %m\n"); + return -ENOMEM; + } + + memset(update, 0, size); + update->size = size; + + item = update->items; + + item->type = KDBUS_ITEM_NAME; + item->size = KDBUS_ITEM_HEADER_SIZE + strlen(name) + 1; + strcpy(item->str, name); + item = KDBUS_ITEM_NEXT(item); + + for (i = 0; i < num_access; i++) { + item->size = KDBUS_ITEM_HEADER_SIZE + + sizeof(struct kdbus_policy_access); + item->type = KDBUS_ITEM_POLICY_ACCESS; + + item->policy_access.type = access[i].type; + item->policy_access.access = access[i].access; + item->policy_access.id = access[i].id; + + item = KDBUS_ITEM_NEXT(item); + } + + ret = kdbus_cmd_update(conn->fd, update); + if (ret < 0) + kdbus_printf("error conn update: %d (%m)\n", ret); + + free(update); + + return ret; +} + +int kdbus_add_match_id(struct kdbus_conn *conn, uint64_t cookie, + uint64_t type, uint64_t id) +{ + struct { + struct kdbus_cmd_match cmd; + struct { + uint64_t size; + uint64_t type; + struct kdbus_notify_id_change chg; + } item; + } buf; + int ret; + + memset(&buf, 0, sizeof(buf)); + + buf.cmd.size = sizeof(buf); + buf.cmd.cookie = cookie; + buf.item.size = sizeof(buf.item); + buf.item.type = type; + buf.item.chg.id = id; + + ret = kdbus_cmd_match_add(conn->fd, &buf.cmd); + if (ret < 0) + kdbus_printf("--- error adding conn match: %d (%m)\n", ret); + + return ret; +} + +int kdbus_add_match_empty(struct kdbus_conn *conn) +{ + struct { + struct kdbus_cmd_match cmd; + struct kdbus_item item; + } buf; + int ret; + + memset(&buf, 0, sizeof(buf)); + + buf.item.size = sizeof(uint64_t) * 3; + buf.item.type = KDBUS_ITEM_ID; + buf.item.id = KDBUS_MATCH_ID_ANY; + + buf.cmd.size = sizeof(buf.cmd) + buf.item.size; + + ret = kdbus_cmd_match_add(conn->fd, &buf.cmd); + if (ret < 0) + kdbus_printf("--- error adding conn 
match: %d (%m)\n", ret);
+
+	return ret;
+}
+
+static int all_ids_are_mapped(const char *path)
+{
+	int ret;
+	FILE *file;
+	uint32_t inside_id, length;
+
+	file = fopen(path, "r");
+	if (!file) {
+		ret = -errno;
+		kdbus_printf("error fopen() %s: %d (%m)\n",
+			     path, ret);
+		return ret;
+	}
+
+	ret = fscanf(file, "%u\t%*u\t%u", &inside_id, &length);
+	if (ret != 2) {
+		if (ferror(file))
+			ret = -errno;
+		else
+			ret = -EIO;
+
+		kdbus_printf("--- error fscanf(): %d\n", ret);
+		fclose(file);
+		return ret;
+	}
+
+	fclose(file);
+
+	/*
+	 * If the length is 4294967295, i.e. the invalid uid
+	 * (uid_t) -1, then we are able to map all uids/gids.
+	 */
+	if (inside_id == 0 && length == (uid_t) -1)
+		return 1;
+
+	return 0;
+}
+
+int all_uids_gids_are_mapped(void)
+{
+	int ret;
+
+	ret = all_ids_are_mapped("/proc/self/uid_map");
+	if (ret <= 0) {
+		kdbus_printf("--- error not all uids are mapped\n");
+		return 0;
+	}
+
+	ret = all_ids_are_mapped("/proc/self/gid_map");
+	if (ret <= 0) {
+		kdbus_printf("--- error not all gids are mapped\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+int drop_privileges(uid_t uid, gid_t gid)
+{
+	int ret;
+
+	ret = setgroups(0, NULL);
+	if (ret < 0) {
+		ret = -errno;
+		kdbus_printf("error setgroups: %d (%m)\n", ret);
+		return ret;
+	}
+
+	ret = setresgid(gid, gid, gid);
+	if (ret < 0) {
+		ret = -errno;
+		kdbus_printf("error setresgid: %d (%m)\n", ret);
+		return ret;
+	}
+
+	ret = setresuid(uid, uid, uid);
+	if (ret < 0) {
+		ret = -errno;
+		kdbus_printf("error setresuid: %d (%m)\n", ret);
+		return ret;
+	}
+
+	return ret;
+}
+
+uint64_t now(clockid_t clock)
+{
+	struct timespec spec;
+
+	clock_gettime(clock, &spec);
+	return spec.tv_sec * 1000ULL * 1000ULL * 1000ULL + spec.tv_nsec;
+}
+
+char *unique_name(const char *prefix)
+{
+	unsigned int i;
+	uint64_t u_now;
+	char n[17];
+	char *str;
+	int r;
+
+	/*
+	 * This returns a random string which is guaranteed to be
+	 * globally unique across all calls to unique_name(). We
+	 * compose the string as:
+	 * --